From af732203b8f7f006927528db5497f5cbc4c4742a Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sun, 13 Jun 2021 21:31:46 +0200 Subject: Merge llvm-project 12.0.1 release and follow-up fixes Merge llvm-project main llvmorg-12-init-17869-g8e464dd76bef This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp to llvmorg-12-init-17869-g8e464dd76bef, the last commit before the upstream release/12.x branch was created. PR: 255570 (cherry picked from commit e8d8bef961a50d4dc22501cde4fb9fb0be1b2532) Merge llvm-project 12.0.0 release This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp to llvmorg-12.0.0-0-gd28af7c654d8, a.k.a. 12.0.0 release. PR: 255570 (cherry picked from commit d409305fa3838fb39b38c26fc085fb729b8766d5) Disable strict-fp for powerpcspe, as it does not work properly yet Merge commit 5c18d1136665 from llvm git (by Qiu Chaofan) [SPE] Disable strict-fp for SPE by default As discussed in PR50385, strict-fp on PowerPC SPE has not been handled well. This patch disables it by default for SPE. Reviewed By: nemanjai, vit9696, jhibbits Differential Revision: https://reviews.llvm.org/D103235 PR: 255570 (cherry picked from commit 715df83abc049b23d9acddc81f2480bd4c056d64) Apply upstream libc++ fix to allow building with devel/xxx-xtoolchain-gcc Merge commit 52e9d80d5db2 from llvm git (by Jason Liu): [libc++] add `inline` for __open's definition in ifstream and ofstream Summary: When building with gcc on AIX, it seems that gcc does not like the `always_inline` without the `inline` keyword. So adding the inline keywords in for __open in ifstream and ofstream. That will also make it consistent with __open in basic_filebuf (it seems we added `inline` there before for gcc build as well). 
Differential Revision: https://reviews.llvm.org/D99422 PR: 255570 (cherry picked from commit d099db25464b826c5724cf2fb5b22292bbe15f6e) Undefine HAVE_(DE)REGISTER_FRAME in llvm's config.h on arm Otherwise, the lli tool (enabled by WITH_CLANG_EXTRAS) won't link on arm, stating that __register_frame is undefined. This function is normally provided by libunwind, but explicitly not for the ARM Exception ABI. Reported by: oh PR: 255570 (cherry picked from commit f336b45e943c7f9a90ffcea1a6c4c7039e54c73c) Merge llvm-project 12.0.1 rc2 This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp to llvmorg-12.0.1-rc2-0-ge7dac564cd0e, a.k.a. 12.0.1 rc2. PR: 255570 (cherry picked from commit 23408297fbf3089f0388a8873b02fa75ab3f5bb9) Revert libunwind change to fix backtrace segfault on aarch64 Revert commit 22b615a96593 from llvm git (by Daniel Kiss): [libunwind] Support for leaf function unwinding. Unwinding leaf function is useful in cases when the backtrace finds a leaf function for example when it caused a signal. This patch also add the support for the DW_CFA_undefined because it marks the end of the frames. Ryan Prichard provided code for the tests. Reviewed By: #libunwind, mstorsjo Differential Revision: https://reviews.llvm.org/D83573 Reland with limit the test to the x86_64-linux target. Bisection has shown that this particular upstream commit causes programs using backtrace(3) on aarch64 to segfault. This affects the lang/rust port, for instance. Until we can upstream to fix this problem, revert the commit for now. Reported by: mikael PR: 256864 (cherry picked from commit 5866c369e4fd917c0d456f0f10b92ee354b82279) Merge llvm-project 12.0.1 release This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp to llvmorg-12.0.1-0-gfed41342a82f, a.k.a. 12.0.1 release. 
PR: 255570 (cherry picked from commit 4652422eb477731f284b1345afeefef7f269da50) compilert-rt: build out-of-line LSE atomics helpers for aarch64 Both clang >= 12 and gcc >= 10.1 now default to -moutline-atomics for aarch64. This requires a bunch of helper functions in libcompiler_rt.a, to avoid link errors like "undefined symbol: __aarch64_ldadd8_acq_rel". (Note: of course you can use -mno-outline-atomics as a workaround too, but this would negate the potential performance benefit of the faster LSE instructions.) Bump __FreeBSD_version so ports maintainers can easily detect this. PR: 257392 (cherry picked from commit cc55ee8009a550810d38777fd6ace9abf3a2f6b4) --- contrib/llvm-project/llvm/include/llvm-c/Core.h | 60 +- .../llvm-project/llvm/include/llvm-c/DataTypes.h | 4 - .../llvm-project/llvm/include/llvm-c/DebugInfo.h | 4 +- contrib/llvm-project/llvm/include/llvm-c/Error.h | 5 + contrib/llvm-project/llvm/include/llvm-c/LLJIT.h | 213 + .../llvm/include/llvm-c/LinkTimeOptimizer.h | 66 - contrib/llvm-project/llvm/include/llvm-c/Orc.h | 452 +- .../llvm-project/llvm/include/llvm-c/OrcBindings.h | 169 - contrib/llvm-project/llvm/include/llvm-c/OrcEE.h | 55 + .../llvm/include/llvm-c/Transforms/IPO.h | 3 - .../llvm/include/llvm-c/Transforms/Scalar.h | 6 +- .../llvm/include/llvm/ADT/APFixedPoint.h | 237 + .../llvm-project/llvm/include/llvm/ADT/APFloat.h | 5 +- contrib/llvm-project/llvm/include/llvm/ADT/APInt.h | 44 +- .../llvm-project/llvm/include/llvm/ADT/APSInt.h | 20 +- .../llvm/include/llvm/ADT/AllocatorList.h | 7 - contrib/llvm-project/llvm/include/llvm/ADT/Any.h | 7 +- .../llvm-project/llvm/include/llvm/ADT/BitVector.h | 40 +- .../llvm-project/llvm/include/llvm/ADT/DenseMap.h | 35 +- .../llvm/include/llvm/ADT/DenseMapInfo.h | 45 + .../llvm-project/llvm/include/llvm/ADT/DenseSet.h | 21 +- .../llvm/include/llvm/ADT/DepthFirstIterator.h | 2 +- .../llvm/include/llvm/ADT/DirectedGraph.h | 16 +- .../llvm/include/llvm/ADT/FloatingPointMode.h | 18 + 
.../llvm/include/llvm/ADT/FunctionExtras.h | 12 +- .../llvm-project/llvm/include/llvm/ADT/Hashing.h | 25 + .../llvm/include/llvm/ADT/IntervalMap.h | 10 +- .../llvm/include/llvm/ADT/IntrusiveRefCntPtr.h | 45 +- .../llvm-project/llvm/include/llvm/ADT/Optional.h | 165 +- .../llvm/include/llvm/ADT/PointerUnion.h | 7 - .../llvm-project/llvm/include/llvm/ADT/STLExtras.h | 104 +- .../llvm-project/llvm/include/llvm/ADT/Sequence.h | 4 + .../llvm-project/llvm/include/llvm/ADT/SetVector.h | 5 + .../llvm-project/llvm/include/llvm/ADT/SmallSet.h | 7 + .../llvm/include/llvm/ADT/SmallString.h | 61 +- .../llvm/include/llvm/ADT/SmallVector.h | 549 +- .../llvm-project/llvm/include/llvm/ADT/SparseSet.h | 9 +- .../llvm-project/llvm/include/llvm/ADT/Statistic.h | 2 + .../llvm/include/llvm/ADT/StringExtras.h | 109 +- .../llvm-project/llvm/include/llvm/ADT/StringMap.h | 12 +- .../llvm-project/llvm/include/llvm/ADT/StringSet.h | 3 + .../llvm-project/llvm/include/llvm/ADT/Triple.h | 67 +- .../llvm-project/llvm/include/llvm/ADT/iterator.h | 22 +- .../llvm/include/llvm/ADT/iterator_range.h | 1 - .../llvm/include/llvm/ADT/simple_ilist.h | 6 +- .../llvm/include/llvm/Analysis/AliasAnalysis.h | 104 +- .../llvm/include/llvm/Analysis/AliasSetTracker.h | 30 +- .../llvm/include/llvm/Analysis/AssumptionCache.h | 2 +- .../include/llvm/Analysis/BasicAliasAnalysis.h | 63 +- .../include/llvm/Analysis/BlockFrequencyInfoImpl.h | 12 +- .../include/llvm/Analysis/BranchProbabilityInfo.h | 287 +- .../llvm/include/llvm/Analysis/CFGPrinter.h | 22 +- .../llvm/include/llvm/Analysis/CGSCCPassManager.h | 540 +- .../llvm/include/llvm/Analysis/CallGraph.h | 7 - .../llvm/include/llvm/Analysis/CaptureTracking.h | 8 + .../llvm/include/llvm/Analysis/CodeMetrics.h | 3 +- .../llvm/include/llvm/Analysis/ConstantFolding.h | 6 +- .../llvm/include/llvm/Analysis/ConstraintSystem.h | 88 + .../llvm-project/llvm/include/llvm/Analysis/DDG.h | 28 +- .../llvm/include/llvm/Analysis/DDGPrinter.h | 91 + 
.../llvm/include/llvm/Analysis/Delinearization.h | 33 + .../llvm/include/llvm/Analysis/DemandedBits.h | 14 + .../include/llvm/Analysis/DivergenceAnalysis.h | 85 +- .../llvm/include/llvm/Analysis/DominanceFrontier.h | 1 - .../llvm/include/llvm/Analysis/EHPersonalities.h | 11 +- .../llvm/Analysis/FunctionPropertiesAnalysis.h | 86 + .../include/llvm/Analysis/IRSimilarityIdentifier.h | 789 +++ .../llvm/include/llvm/Analysis/IVDescriptors.h | 154 +- .../llvm/include/llvm/Analysis/InlineAdvisor.h | 82 +- .../include/llvm/Analysis/InlineFeaturesAnalysis.h | 45 - .../llvm/Analysis/InlineSizeEstimatorAnalysis.h | 12 +- .../llvm/include/llvm/Analysis/InstCount.h | 28 + .../include/llvm/Analysis/InstructionSimplify.h | 42 +- .../llvm/include/llvm/Analysis/Interval.h | 3 - .../llvm/include/llvm/Analysis/IntervalIterator.h | 6 +- .../llvm/Analysis/IteratedDominanceFrontier.h | 8 +- .../llvm/Analysis/LazyBranchProbabilityInfo.h | 2 +- .../llvm/include/llvm/Analysis/LazyCallGraph.h | 51 +- .../llvm/include/llvm/Analysis/LazyValueInfo.h | 18 +- .../llvm-project/llvm/include/llvm/Analysis/Lint.h | 28 +- .../llvm/include/llvm/Analysis/Loads.h | 9 + .../include/llvm/Analysis/LoopAccessAnalysis.h | 17 +- .../include/llvm/Analysis/LoopAnalysisManager.h | 1 + .../llvm/include/llvm/Analysis/LoopCacheAnalysis.h | 29 +- .../llvm/include/llvm/Analysis/LoopInfo.h | 22 +- .../llvm/include/llvm/Analysis/LoopInfoImpl.h | 22 +- .../llvm/include/llvm/Analysis/LoopNestAnalysis.h | 17 +- .../llvm/include/llvm/Analysis/MLInlineAdvisor.h | 13 +- .../llvm/include/llvm/Analysis/MemDerefPrinter.h | 24 + .../llvm/Analysis/MemoryDependenceAnalysis.h | 2 +- .../llvm/include/llvm/Analysis/MemoryLocation.h | 68 +- .../llvm/include/llvm/Analysis/MemorySSA.h | 61 +- .../llvm/include/llvm/Analysis/MemorySSAUpdater.h | 9 +- .../include/llvm/Analysis/ModuleDebugInfoPrinter.h | 29 + .../llvm/include/llvm/Analysis/MustExecute.h | 19 + .../include/llvm/Analysis/ObjCARCAnalysisUtils.h | 34 +- 
.../llvm/Analysis/OptimizationRemarkEmitter.h | 10 +- .../llvm/include/llvm/Analysis/PhiValues.h | 7 +- .../include/llvm/Analysis/ProfileSummaryInfo.h | 6 +- .../llvm/include/llvm/Analysis/RegionInfo.h | 3 - .../llvm/include/llvm/Analysis/RegionInfoImpl.h | 12 +- .../llvm/include/llvm/Analysis/RegionPass.h | 2 - .../include/llvm/Analysis/ReplayInlineAdvisor.h | 41 + .../llvm/include/llvm/Analysis/ScalarEvolution.h | 279 +- .../llvm/Analysis/ScalarEvolutionDivision.h | 1 + .../llvm/Analysis/ScalarEvolutionExpressions.h | 135 +- .../llvm/include/llvm/Analysis/SparsePropagation.h | 3 +- .../llvm/include/llvm/Analysis/StackLifetime.h | 16 +- .../include/llvm/Analysis/StackSafetyAnalysis.h | 3 +- .../include/llvm/Analysis/SyncDependenceAnalysis.h | 42 +- .../include/llvm/Analysis/TargetLibraryInfo.def | 21 + .../llvm/include/llvm/Analysis/TargetLibraryInfo.h | 1 + .../include/llvm/Analysis/TargetTransformInfo.h | 271 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 346 +- .../Utils/ImportedFunctionsInliningStatistics.h | 112 + .../llvm/include/llvm/Analysis/Utils/Local.h | 76 +- .../llvm/include/llvm/Analysis/Utils/TFUtils.h | 203 +- .../llvm/include/llvm/Analysis/ValueLattice.h | 11 + .../llvm/include/llvm/Analysis/ValueTracking.h | 107 +- .../llvm/include/llvm/Analysis/VecFuncs.def | 146 + .../llvm/include/llvm/Analysis/VectorUtils.h | 56 +- .../llvm/include/llvm/BinaryFormat/COFF.h | 1 + .../llvm/include/llvm/BinaryFormat/Dwarf.def | 19 +- .../llvm/include/llvm/BinaryFormat/Dwarf.h | 93 +- .../llvm/include/llvm/BinaryFormat/DynamicTags.def | 1 + .../llvm/include/llvm/BinaryFormat/ELF.h | 92 +- .../include/llvm/BinaryFormat/ELFRelocs/CSKY.def | 74 + .../llvm/BinaryFormat/ELFRelocs/PowerPC64.def | 12 + .../llvm/include/llvm/BinaryFormat/MachO.h | 5 +- .../llvm/include/llvm/BinaryFormat/Wasm.h | 49 +- .../llvm/include/llvm/BinaryFormat/WasmRelocs.def | 5 + .../llvm/include/llvm/BinaryFormat/WasmTraits.h | 68 + .../llvm/include/llvm/BinaryFormat/XCOFF.h | 111 + 
.../llvm/include/llvm/Bitcode/BitcodeCommon.h | 30 + .../llvm/include/llvm/Bitcode/BitcodeConvenience.h | 486 ++ .../llvm/include/llvm/Bitcode/BitcodeWriter.h | 16 +- .../llvm/include/llvm/Bitcode/LLVMBitCodes.h | 25 +- .../llvm/include/llvm/Bitstream/BitCodes.h | 9 +- .../llvm/include/llvm/Bitstream/BitstreamWriter.h | 103 +- .../llvm/include/llvm/CodeGen/Analysis.h | 5 - .../llvm/include/llvm/CodeGen/AntiDepBreaker.h | 1 - .../llvm/include/llvm/CodeGen/AsmPrinter.h | 131 +- .../llvm/include/llvm/CodeGen/AsmPrinterHandler.h | 7 +- .../include/llvm/CodeGen/BasicBlockSectionUtils.h | 30 + .../llvm/include/llvm/CodeGen/BasicTTIImpl.h | 491 +- .../llvm/include/llvm/CodeGen/CalcSpillWeights.h | 70 +- .../llvm/include/llvm/CodeGen/CallingConvLower.h | 15 +- .../llvm/include/llvm/CodeGen/CodeGenPassBuilder.h | 1144 +++++ .../llvm/include/llvm/CodeGen/CommandFlags.h | 30 +- .../llvm-project/llvm/include/llvm/CodeGen/DIE.h | 27 +- .../llvm/CodeGen/DbgEntityHistoryCalculator.h | 24 + .../llvm/include/llvm/CodeGen/DebugHandlerBase.h | 12 +- .../include/llvm/CodeGen/DwarfStringPoolEntry.h | 4 +- .../llvm/include/llvm/CodeGen/FastISel.h | 43 +- .../include/llvm/CodeGen/FunctionLoweringInfo.h | 34 +- .../llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h | 8 +- .../include/llvm/CodeGen/GlobalISel/CallLowering.h | 150 +- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 268 +- .../llvm/CodeGen/GlobalISel/GISelChangeObserver.h | 2 +- .../llvm/CodeGen/GlobalISel/GISelKnownBits.h | 12 +- .../llvm/CodeGen/GlobalISel/GISelWorkList.h | 3 - .../include/llvm/CodeGen/GlobalISel/IRTranslator.h | 102 +- .../llvm/CodeGen/GlobalISel/InstructionSelector.h | 45 +- .../CodeGen/GlobalISel/InstructionSelectorImpl.h | 77 +- .../GlobalISel/LegalizationArtifactCombiner.h | 124 +- .../include/llvm/CodeGen/GlobalISel/Legalizer.h | 3 - .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 82 +- .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 80 +- .../include/llvm/CodeGen/GlobalISel/Localizer.h | 8 +- 
.../llvm/CodeGen/GlobalISel/MIPatternMatch.h | 102 +- .../llvm/CodeGen/GlobalISel/MachineIRBuilder.h | 162 +- .../llvm/CodeGen/GlobalISel/RegisterBankInfo.h | 23 +- .../llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 130 +- .../llvm/include/llvm/CodeGen/ISDOpcodes.h | 108 +- .../llvm/include/llvm/CodeGen/LexicalScopes.h | 3 - .../llvm/include/llvm/CodeGen/LiveInterval.h | 34 +- .../llvm/include/llvm/CodeGen/LiveIntervalUnion.h | 3 + .../llvm/include/llvm/CodeGen/LiveIntervals.h | 25 +- .../llvm/include/llvm/CodeGen/LiveRangeEdit.h | 8 +- .../llvm/include/llvm/CodeGen/LiveRegMatrix.h | 17 +- .../llvm/include/llvm/CodeGen/LiveRegUnits.h | 3 +- .../llvm/include/llvm/CodeGen/LiveVariables.h | 54 +- .../llvm/include/llvm/CodeGen/LowLevelType.h | 4 + .../llvm/include/llvm/CodeGen/MBFIWrapper.h | 2 + .../llvm/include/llvm/CodeGen/MIRFormatter.h | 6 +- .../llvm/include/llvm/CodeGen/MIRYamlMapping.h | 51 +- .../llvm/include/llvm/CodeGen/MachineBasicBlock.h | 31 +- .../llvm/CodeGen/MachineBlockFrequencyInfo.h | 23 +- .../include/llvm/CodeGen/MachineCombinerPattern.h | 5 + .../include/llvm/CodeGen/MachineConstantPool.h | 6 +- .../llvm/CodeGen/MachineDominanceFrontier.h | 1 - .../llvm/include/llvm/CodeGen/MachineDominators.h | 1 - .../llvm/include/llvm/CodeGen/MachineFrameInfo.h | 7 +- .../llvm/include/llvm/CodeGen/MachineFunction.h | 58 +- .../llvm/include/llvm/CodeGen/MachineInstr.h | 40 +- .../include/llvm/CodeGen/MachineInstrBuilder.h | 41 +- .../include/llvm/CodeGen/MachineJumpTableInfo.h | 3 + .../llvm/include/llvm/CodeGen/MachineLoopInfo.h | 6 + .../llvm/include/llvm/CodeGen/MachineLoopUtils.h | 4 - .../llvm/include/llvm/CodeGen/MachineModuleInfo.h | 27 +- .../llvm/include/llvm/CodeGen/MachineOperand.h | 14 +- .../llvm/include/llvm/CodeGen/MachineOutliner.h | 3 +- .../llvm/include/llvm/CodeGen/MachinePassManager.h | 256 + .../include/llvm/CodeGen/MachinePassRegistry.def | 197 + .../llvm/include/llvm/CodeGen/MachinePipeliner.h | 17 +- 
.../include/llvm/CodeGen/MachineRegisterInfo.h | 42 +- .../llvm/include/llvm/CodeGen/MachineSSAUpdater.h | 4 +- .../llvm/include/llvm/CodeGen/MachineStableHash.h | 30 + .../include/llvm/CodeGen/MachineTraceMetrics.h | 4 +- .../include/llvm/CodeGen/MultiHazardRecognizer.h | 47 + .../llvm/CodeGen/NonRelocatableStringpool.h | 4 +- .../llvm/include/llvm/CodeGen/Passes.h | 32 +- .../llvm/include/llvm/CodeGen/RDFGraph.h | 4 - .../llvm/include/llvm/CodeGen/RDFLiveness.h | 38 +- .../llvm/include/llvm/CodeGen/RDFRegisters.h | 43 +- .../include/llvm/CodeGen/ReachingDefAnalysis.h | 45 +- .../llvm/include/llvm/CodeGen/RegAllocPBQP.h | 28 +- .../llvm/include/llvm/CodeGen/Register.h | 32 +- .../llvm/include/llvm/CodeGen/RegisterPressure.h | 34 +- .../llvm/include/llvm/CodeGen/RegisterScavenging.h | 13 +- .../include/llvm/CodeGen/ResourcePriorityQueue.h | 1 - .../llvm/include/llvm/CodeGen/RuntimeLibcalls.h | 5 + .../llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 20 +- .../llvm/CodeGen/ScheduleHazardRecognizer.h | 10 +- .../llvm/include/llvm/CodeGen/SelectionDAG.h | 138 +- .../llvm/include/llvm/CodeGen/SelectionDAGISel.h | 2 - .../llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 240 +- .../include/llvm/CodeGen/SelectionDAGTargetInfo.h | 2 +- .../llvm/include/llvm/CodeGen/SlotIndexes.h | 45 +- .../llvm/include/llvm/CodeGen/StableHashing.h | 112 + .../llvm/include/llvm/CodeGen/StackMaps.h | 62 +- .../include/llvm/CodeGen/SwitchLoweringUtils.h | 11 +- .../llvm/include/llvm/CodeGen/TargetCallingConv.h | 62 +- .../include/llvm/CodeGen/TargetFrameLowering.h | 18 +- .../llvm/include/llvm/CodeGen/TargetInstrInfo.h | 121 +- .../llvm/include/llvm/CodeGen/TargetLowering.h | 130 +- .../llvm/CodeGen/TargetLoweringObjectFileImpl.h | 30 +- .../llvm/include/llvm/CodeGen/TargetPassConfig.h | 28 +- .../llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 96 +- .../include/llvm/CodeGen/TargetSubtargetInfo.h | 4 +- .../llvm/include/llvm/CodeGen/TileShapeInfo.h | 97 + 
.../llvm/include/llvm/CodeGen/ValueTypes.h | 96 +- .../llvm/include/llvm/CodeGen/ValueTypes.td | 209 +- .../llvm/include/llvm/CodeGen/VirtRegMap.h | 35 +- .../llvm/include/llvm/CodeGen/WasmEHFuncInfo.h | 2 + .../llvm/include/llvm/DWARFLinker/DWARFLinker.h | 104 +- .../llvm/DWARFLinker/DWARFLinkerCompileUnit.h | 33 +- .../llvm/DWARFLinker/DWARFLinkerDeclContext.h | 31 +- .../llvm/include/llvm/DWARFLinker/DWARFStreamer.h | 2 +- .../include/llvm/DebugInfo/CodeView/CVRecord.h | 17 +- .../llvm/DebugInfo/CodeView/CVSymbolVisitor.h | 3 - .../llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 3 +- .../llvm/DebugInfo/CodeView/CodeViewRegisters.def | 36 +- .../DebugInfo/CodeView/DebugSubsectionVisitor.h | 3 - .../DebugInfo/CodeView/DebugSymbolsSubsection.h | 2 +- .../DebugInfo/CodeView/LazyRandomTypeCollection.h | 1 - .../include/llvm/DebugInfo/CodeView/RecordName.h | 1 - .../include/llvm/DebugInfo/CodeView/SymbolDumper.h | 2 +- .../include/llvm/DebugInfo/CodeView/SymbolRecord.h | 3 - .../llvm/DebugInfo/CodeView/SymbolRecordHelpers.h | 3 +- .../llvm/DebugInfo/CodeView/TypeCollection.h | 3 +- .../include/llvm/DebugInfo/CodeView/TypeHashing.h | 17 +- .../include/llvm/DebugInfo/CodeView/TypeIndex.h | 11 +- .../llvm/DebugInfo/CodeView/TypeIndexDiscovery.h | 4 +- .../include/llvm/DebugInfo/CodeView/TypeRecord.h | 8 +- .../llvm/DebugInfo/CodeView/TypeRecordHelpers.h | 3 +- .../llvm/DebugInfo/CodeView/TypeStreamMerger.h | 2 +- .../llvm/include/llvm/DebugInfo/DIContext.h | 18 +- .../DebugInfo/DWARF/DWARFAbbreviationDeclaration.h | 10 + .../include/llvm/DebugInfo/DWARF/DWARFContext.h | 18 +- .../include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h | 18 + .../llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h | 3 +- .../include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h | 17 +- .../include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 12 +- .../include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h | 2 + .../include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h | 7 +- .../llvm/DebugInfo/DWARF/DWARFDebugRnglists.h | 3 +- 
.../llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 6 +- .../include/llvm/DebugInfo/DWARF/DWARFExpression.h | 16 +- .../include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 3 + .../include/llvm/DebugInfo/DWARF/DWARFListTable.h | 66 +- .../llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h | 37 +- .../include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 7 +- .../include/llvm/DebugInfo/MSF/MappedBlockStream.h | 2 - .../PDB/Native/DbiModuleDescriptorBuilder.h | 63 +- .../llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h | 41 + .../DebugInfo/PDB/Native/NativeFunctionSymbol.h | 6 +- .../DebugInfo/PDB/Native/NativeInlineSiteSymbol.h | 46 + .../llvm/DebugInfo/PDB/Native/NativeLineNumber.h | 3 +- .../llvm/DebugInfo/PDB/Native/NativePublicSymbol.h | 1 - .../llvm/DebugInfo/PDB/Native/NativeSession.h | 11 + .../llvm/DebugInfo/PDB/Native/SymbolCache.h | 48 +- .../include/llvm/DebugInfo/PDB/Native/TpiStream.h | 2 +- .../llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h | 9 +- .../llvm/include/llvm/DebugInfo/PDB/PDBExtras.h | 6 +- .../llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h | 8 +- .../include/llvm/DebugInfo/Symbolize/Symbolize.h | 2 +- .../llvm/include/llvm/Demangle/ItaniumDemangle.h | 203 +- .../llvm/include/llvm/Demangle/Utility.h | 2 +- .../include/llvm/ExecutionEngine/ExecutionEngine.h | 21 - .../llvm/ExecutionEngine/JITEventListener.h | 1 - .../llvm/ExecutionEngine/JITLink/EHFrameSupport.h | 33 +- .../include/llvm/ExecutionEngine/JITLink/ELF.h | 13 +- .../llvm/ExecutionEngine/JITLink/ELF_x86_64.h | 14 +- .../include/llvm/ExecutionEngine/JITLink/JITLink.h | 91 +- .../llvm/ExecutionEngine/JITLink/JITLinkDylib.h | 24 + .../ExecutionEngine/JITLink/JITLinkMemoryManager.h | 19 +- .../include/llvm/ExecutionEngine/JITLink/MachO.h | 11 +- .../llvm/ExecutionEngine/JITLink/MachO_arm64.h | 11 +- .../llvm/ExecutionEngine/JITLink/MachO_x86_64.h | 13 +- .../llvm/include/llvm/ExecutionEngine/JITSymbol.h | 2 +- .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 639 +-- .../llvm/ExecutionEngine/Orc/CompileUtils.h | 
2 - .../llvm/include/llvm/ExecutionEngine/Orc/Core.h | 611 ++- .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 110 +- .../llvm/ExecutionEngine/Orc/GlobalMappingLayer.h | 111 - .../llvm/ExecutionEngine/Orc/IRCompileLayer.h | 100 +- .../llvm/ExecutionEngine/Orc/IRTransformLayer.h | 80 +- .../llvm/ExecutionEngine/Orc/IndirectionUtils.h | 55 +- .../llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h | 49 +- .../llvm/ExecutionEngine/Orc/LambdaResolver.h | 84 - .../llvm/include/llvm/ExecutionEngine/Orc/Layer.h | 42 +- .../llvm/ExecutionEngine/Orc/LazyEmittingLayer.h | 267 - .../llvm/ExecutionEngine/Orc/LazyReexports.h | 17 +- .../llvm/include/llvm/ExecutionEngine/Orc/Legacy.h | 211 - .../llvm/ExecutionEngine/Orc/MachOPlatform.h | 20 +- .../llvm/ExecutionEngine/Orc/NullResolver.h | 43 - .../llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h | 57 +- .../ExecutionEngine/Orc/ObjectTransformLayer.h | 84 +- .../include/llvm/ExecutionEngine/Orc/OrcError.h | 74 - .../Orc/OrcRPCTargetProcessControl.h | 415 ++ .../ExecutionEngine/Orc/OrcRemoteTargetClient.h | 263 +- .../ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h | 280 +- .../ExecutionEngine/Orc/OrcRemoteTargetServer.h | 21 +- .../ExecutionEngine/Orc/RPC/RPCSerialization.h | 702 --- .../llvm/ExecutionEngine/Orc/RPC/RPCUtils.h | 1687 ------- .../llvm/ExecutionEngine/Orc/RPC/RawByteChannel.h | 184 - .../ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h | 387 +- .../llvm/ExecutionEngine/Orc/RemoteObjectLayer.h | 564 --- .../ExecutionEngine/Orc/Shared/FDRawByteChannel.h | 79 + .../llvm/ExecutionEngine/Orc/Shared/OrcError.h | 74 + .../llvm/ExecutionEngine/Orc/Shared/RPCUtils.h | 1657 +++++++ .../ExecutionEngine/Orc/Shared/RawByteChannel.h | 183 + .../ExecutionEngine/Orc/Shared/Serialization.h | 769 +++ .../Orc/Shared/TargetProcessControlTypes.h | 165 + .../include/llvm/ExecutionEngine/Orc/Speculation.h | 7 +- .../Orc/TPCDynamicLibrarySearchGenerator.h | 66 + .../llvm/ExecutionEngine/Orc/TPCEHFrameRegistrar.h | 54 + 
.../llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h | 222 + .../Orc/TargetProcess/OrcRPCTPCServer.h | 620 +++ .../Orc/TargetProcess/RegisterEHFrames.h | 41 + .../Orc/TargetProcess/TargetExecutionUtils.h | 38 + .../ExecutionEngine/Orc/TargetProcessControl.h | 218 + .../llvm/ExecutionEngine/Orc/ThreadSafeModule.h | 2 +- .../include/llvm/ExecutionEngine/RuntimeDyld.h | 11 +- .../llvm/include/llvm/FileCheck/FileCheck.h | 216 + .../llvm/Frontend/Directive/DirectiveBase.td | 52 +- .../llvm/include/llvm/Frontend/OpenACC/ACC.td | 132 +- .../llvm/include/llvm/Frontend/OpenMP/OMP.td | 234 +- .../include/llvm/Frontend/OpenMP/OMPConstants.h | 44 +- .../llvm/include/llvm/Frontend/OpenMP/OMPContext.h | 49 +- .../include/llvm/Frontend/OpenMP/OMPGridValues.h | 22 +- .../include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 402 +- .../llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 1046 ++-- .../llvm/include/llvm/FuzzMutate/IRMutator.h | 11 + .../llvm-project/llvm/include/llvm/IR/Argument.h | 27 +- .../llvm/include/llvm/IR/Assumptions.h | 50 + .../llvm-project/llvm/include/llvm/IR/Attributes.h | 78 +- .../llvm/include/llvm/IR/Attributes.td | 32 +- .../llvm-project/llvm/include/llvm/IR/BasicBlock.h | 88 +- .../llvm/include/llvm/IR/CallingConv.h | 3 + .../llvm-project/llvm/include/llvm/IR/Constant.h | 32 +- .../llvm/include/llvm/IR/ConstantRange.h | 26 +- .../llvm-project/llvm/include/llvm/IR/Constants.h | 99 +- .../llvm-project/llvm/include/llvm/IR/DIBuilder.h | 44 +- .../llvm-project/llvm/include/llvm/IR/DataLayout.h | 54 +- .../llvm/include/llvm/IR/DebugInfoMetadata.h | 238 +- .../llvm-project/llvm/include/llvm/IR/DebugLoc.h | 16 +- .../llvm/include/llvm/IR/DerivedTypes.h | 76 +- .../llvm/include/llvm/IR/DiagnosticInfo.h | 24 +- .../llvm-project/llvm/include/llvm/IR/Dominators.h | 30 +- .../llvm/include/llvm/IR/FixedMetadataKinds.def | 5 +- .../llvm/include/llvm/IR/FixedPointBuilder.h | 465 ++ .../llvm-project/llvm/include/llvm/IR/Function.h | 46 +- 
.../include/llvm/IR/GetElementPtrTypeIterator.h | 2 +- .../llvm/include/llvm/IR/GlobalObject.h | 69 +- .../llvm/include/llvm/IR/GlobalVariable.h | 9 +- .../llvm-project/llvm/include/llvm/IR/IRBuilder.h | 175 +- .../llvm/include/llvm/IR/IRPrintingPasses.h | 23 +- .../llvm-project/llvm/include/llvm/IR/InstrTypes.h | 149 +- .../llvm/include/llvm/IR/Instruction.h | 70 +- .../llvm/include/llvm/IR/Instructions.h | 78 +- .../llvm/include/llvm/IR/IntrinsicInst.h | 79 + .../llvm-project/llvm/include/llvm/IR/Intrinsics.h | 12 +- .../llvm/include/llvm/IR/Intrinsics.td | 696 +-- .../llvm/include/llvm/IR/IntrinsicsAArch64.td | 471 +- .../llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 142 +- .../llvm/include/llvm/IR/IntrinsicsARM.td | 25 +- .../llvm/include/llvm/IR/IntrinsicsBPF.td | 10 +- .../llvm/include/llvm/IR/IntrinsicsNVVM.td | 113 +- .../llvm/include/llvm/IR/IntrinsicsPowerPC.td | 584 ++- .../llvm/include/llvm/IR/IntrinsicsRISCV.td | 1024 ++++ .../llvm/include/llvm/IR/IntrinsicsVE.td | 35 + .../llvm/include/llvm/IR/IntrinsicsVEVL.gen.td | 1213 +++++ .../llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 196 +- .../llvm/include/llvm/IR/IntrinsicsX86.td | 153 +- .../llvm/include/llvm/IR/LLVMContext.h | 16 +- .../llvm/include/llvm/IR/LLVMRemarkStreamer.h | 11 +- .../llvm/include/llvm/IR/LegacyPassManagers.h | 5 +- .../llvm-project/llvm/include/llvm/IR/MDBuilder.h | 5 +- .../llvm/include/llvm/IR/MatrixBuilder.h | 39 +- .../llvm-project/llvm/include/llvm/IR/Metadata.def | 2 + .../llvm-project/llvm/include/llvm/IR/Metadata.h | 48 +- contrib/llvm-project/llvm/include/llvm/IR/Module.h | 7 +- .../llvm/include/llvm/IR/ModuleSummaryIndex.h | 14 +- .../llvm-project/llvm/include/llvm/IR/Operator.h | 5 + .../llvm-project/llvm/include/llvm/IR/OptBisect.h | 20 +- .../llvm/include/llvm/IR/PassInstrumentation.h | 117 +- .../llvm/include/llvm/IR/PassManager.h | 138 +- .../llvm/include/llvm/IR/PassManagerImpl.h | 7 - .../llvm/include/llvm/IR/PassManagerInternal.h | 22 + 
.../llvm/include/llvm/IR/PassTimingInfo.h | 15 +- .../llvm/include/llvm/IR/PatternMatch.h | 272 +- .../llvm/include/llvm/IR/PredIteratorCache.h | 4 +- .../llvm/include/llvm/IR/PrintPasses.h | 44 + .../llvm/include/llvm/IR/PseudoProbe.h | 87 + .../llvm/include/llvm/IR/ReplaceConstant.h | 28 + .../llvm/include/llvm/IR/RuntimeLibcalls.def | 32 +- .../llvm-project/llvm/include/llvm/IR/Statepoint.h | 50 +- .../llvm/include/llvm/IR/StructuralHash.h | 34 + .../llvm/include/llvm/IR/SymbolTableListTraits.h | 8 +- contrib/llvm-project/llvm/include/llvm/IR/Type.h | 32 +- contrib/llvm-project/llvm/include/llvm/IR/User.h | 2 +- .../llvm/include/llvm/IR/VPIntrinsics.def | 140 +- .../llvm-project/llvm/include/llvm/IR/Value.def | 18 + contrib/llvm-project/llvm/include/llvm/IR/Value.h | 104 +- .../llvm/include/llvm/IR/ValueHandle.h | 26 +- .../llvm-project/llvm/include/llvm/IR/Verifier.h | 2 + .../llvm/include/llvm/InitializePasses.h | 63 +- .../include/llvm/InterfaceStub/ELFObjHandler.h | 47 + .../llvm/include/llvm/InterfaceStub/ELFStub.h | 66 + .../llvm/include/llvm/InterfaceStub/TBEHandler.h | 43 + .../llvm-project/llvm/include/llvm/LTO/Config.h | 29 +- contrib/llvm-project/llvm/include/llvm/LTO/LTO.h | 12 +- .../llvm/include/llvm/LTO/LTOBackend.h | 25 +- .../include/llvm/LTO/legacy/LTOCodeGenerator.h | 18 +- .../llvm/include/llvm/LTO/legacy/LTOModule.h | 2 - .../llvm-project/llvm/include/llvm/LinkAllPasses.h | 10 +- .../llvm/include/llvm/MC/MCAsmBackend.h | 8 +- .../llvm-project/llvm/include/llvm/MC/MCAsmInfo.h | 50 +- .../llvm-project/llvm/include/llvm/MC/MCAsmMacro.h | 6 + .../llvm/include/llvm/MC/MCAssembler.h | 8 +- .../llvm-project/llvm/include/llvm/MC/MCCodeView.h | 2 - .../llvm-project/llvm/include/llvm/MC/MCContext.h | 31 +- .../llvm-project/llvm/include/llvm/MC/MCDwarf.h | 11 +- .../llvm/include/llvm/MC/MCELFObjectWriter.h | 1 - contrib/llvm-project/llvm/include/llvm/MC/MCExpr.h | 32 +- .../llvm-project/llvm/include/llvm/MC/MCFixup.h | 1 - 
.../llvm-project/llvm/include/llvm/MC/MCFragment.h | 52 + contrib/llvm-project/llvm/include/llvm/MC/MCInst.h | 2 +- .../llvm/include/llvm/MC/MCInstPrinter.h | 12 + .../llvm/include/llvm/MC/MCInstrDesc.h | 31 +- .../llvm/include/llvm/MC/MCMachObjectWriter.h | 14 +- .../llvm/include/llvm/MC/MCObjectFileInfo.h | 17 +- .../llvm/include/llvm/MC/MCObjectStreamer.h | 2 + .../llvm/include/llvm/MC/MCParser/AsmLexer.h | 1 + .../llvm/include/llvm/MC/MCParser/MCAsmLexer.h | 22 +- .../llvm/include/llvm/MC/MCParser/MCAsmParser.h | 30 +- .../include/llvm/MC/MCParser/MCTargetAsmParser.h | 3 +- .../llvm/include/llvm/MC/MCPseudoProbe.h | 178 + .../llvm-project/llvm/include/llvm/MC/MCRegister.h | 25 +- .../llvm/include/llvm/MC/MCRegisterInfo.h | 1 + .../llvm-project/llvm/include/llvm/MC/MCSchedule.h | 2 +- .../llvm/include/llvm/MC/MCSectionXCOFF.h | 18 +- .../llvm-project/llvm/include/llvm/MC/MCStreamer.h | 51 +- .../llvm/include/llvm/MC/MCSubtargetInfo.h | 24 +- .../llvm-project/llvm/include/llvm/MC/MCSymbol.h | 5 +- .../llvm/include/llvm/MC/MCSymbolWasm.h | 18 + .../llvm/include/llvm/MC/MCSymbolXCOFF.h | 4 - .../llvm/include/llvm/MC/MCTargetOptions.h | 1 + .../llvm/include/llvm/MC/MCWasmObjectWriter.h | 4 + .../llvm/include/llvm/MC/MCWasmStreamer.h | 4 - .../llvm-project/llvm/include/llvm/MC/MCWin64EH.h | 7 +- .../llvm/include/llvm/MC/MCWinCOFFStreamer.h | 1 + .../llvm-project/llvm/include/llvm/MC/MCWinEH.h | 24 +- .../llvm/include/llvm/MC/StringTableBuilder.h | 12 +- .../llvm/include/llvm/MC/SubtargetFeature.h | 2 +- .../include/llvm/MCA/HardwareUnits/Scheduler.h | 6 +- .../llvm/include/llvm/Object/ArchiveWriter.h | 6 + .../llvm-project/llvm/include/llvm/Object/Binary.h | 13 +- .../llvm-project/llvm/include/llvm/Object/COFF.h | 23 +- .../llvm-project/llvm/include/llvm/Object/ELF.h | 736 ++- .../llvm/include/llvm/Object/ELFObjectFile.h | 294 +- .../llvm/include/llvm/Object/ELFTypes.h | 30 +- .../llvm-project/llvm/include/llvm/Object/MachO.h | 1 + 
.../llvm/include/llvm/Object/MachOUniversal.h | 8 + .../include/llvm/Object/MachOUniversalWriter.h | 102 + .../llvm/include/llvm/Object/ObjectFile.h | 6 +- .../llvm/include/llvm/Object/RelocationResolver.h | 10 +- .../llvm/include/llvm/Object/StackMapParser.h | 24 +- .../llvm/include/llvm/Object/SymbolicFile.h | 6 +- .../llvm-project/llvm/include/llvm/Object/Wasm.h | 20 +- .../llvm/include/llvm/Object/WasmTraits.h | 68 - .../llvm/include/llvm/Object/XCOFFObjectFile.h | 101 + .../llvm/include/llvm/ObjectYAML/ArchiveYAML.h | 77 + .../llvm/include/llvm/ObjectYAML/DWARFEmitter.h | 16 +- .../llvm/include/llvm/ObjectYAML/DWARFYAML.h | 197 +- .../llvm/include/llvm/ObjectYAML/ELFYAML.h | 332 +- .../llvm/include/llvm/ObjectYAML/MachOYAML.h | 2 +- .../llvm/include/llvm/ObjectYAML/MinidumpYAML.h | 2 +- .../llvm/include/llvm/ObjectYAML/ObjectYAML.h | 2 + .../llvm/include/llvm/ObjectYAML/WasmYAML.h | 3 + .../llvm/include/llvm/ObjectYAML/yaml2obj.h | 5 + .../llvm/include/llvm/Option/ArgList.h | 4 + .../llvm/include/llvm/Option/OptParser.td | 93 +- .../llvm/include/llvm/Option/OptTable.h | 28 +- .../llvm-project/llvm/include/llvm/Option/Option.h | 14 +- contrib/llvm-project/llvm/include/llvm/Pass.h | 20 + .../llvm/include/llvm/PassAnalysisSupport.h | 24 +- .../llvm/include/llvm/Passes/PassBuilder.h | 188 +- .../include/llvm/Passes/StandardInstrumentations.h | 232 +- .../llvm/ProfileData/Coverage/CoverageMapping.h | 70 +- .../ProfileData/Coverage/CoverageMappingReader.h | 4 +- .../llvm/include/llvm/ProfileData/GCOV.h | 184 +- .../llvm/include/llvm/ProfileData/InstrProf.h | 40 +- .../include/llvm/ProfileData/InstrProfData.inc | 139 +- .../include/llvm/ProfileData/InstrProfReader.h | 26 +- .../include/llvm/ProfileData/InstrProfWriter.h | 6 +- .../llvm/include/llvm/ProfileData/ProfileCommon.h | 7 +- .../llvm/include/llvm/ProfileData/SampleProf.h | 325 +- .../include/llvm/ProfileData/SampleProfReader.h | 164 +- .../include/llvm/ProfileData/SampleProfWriter.h | 197 +- 
.../include/llvm/Remarks/BitstreamRemarkParser.h | 6 +- .../include/llvm/Remarks/HotnessThresholdParser.h | 63 + .../include/llvm/Support/AArch64TargetParser.def | 35 + .../include/llvm/Support/AArch64TargetParser.h | 8 +- .../llvm/include/llvm/Support/AMDGPUMetadata.h | 3 +- .../include/llvm/Support/AMDHSAKernelDescriptor.h | 72 +- .../llvm/include/llvm/Support/ARMTargetParser.def | 15 +- .../llvm/include/llvm/Support/ARMTargetParser.h | 3 +- .../llvm/include/llvm/Support/ARMWinEH.h | 87 +- .../llvm/include/llvm/Support/AlignOf.h | 35 +- .../llvm/include/llvm/Support/Allocator.h | 25 +- .../llvm/include/llvm/Support/AtomicOrdering.h | 30 +- .../llvm/include/llvm/Support/BinaryItemStream.h | 3 +- .../llvm/include/llvm/Support/BinaryStreamRef.h | 8 +- .../llvm/include/llvm/Support/CFGDiff.h | 225 +- .../llvm/include/llvm/Support/CFGUpdate.h | 1 - .../llvm/include/llvm/Support/CheckedArithmetic.h | 4 +- .../llvm/include/llvm/Support/CommandLine.h | 32 +- .../llvm/include/llvm/Support/Compiler.h | 24 +- .../include/llvm/Support/CrashRecoveryContext.h | 12 +- .../llvm/include/llvm/Support/DOTGraphTraits.h | 3 +- .../llvm-project/llvm/include/llvm/Support/Error.h | 8 +- .../llvm/include/llvm/Support/ErrorHandling.h | 6 +- .../llvm/include/llvm/Support/ErrorOr.h | 6 +- .../llvm/include/llvm/Support/ExitCodes.h | 33 + .../llvm/include/llvm/Support/FileCheck.h | 191 - .../llvm/include/llvm/Support/FileCollector.h | 78 +- .../llvm/include/llvm/Support/FileSystem.h | 88 +- .../include/llvm/Support/FileSystem/UniqueID.h | 52 + .../llvm/include/llvm/Support/FormatVariadic.h | 4 +- .../llvm/include/llvm/Support/GenericDomTree.h | 62 +- .../llvm/Support/GenericDomTreeConstruction.h | 293 +- .../llvm/include/llvm/Support/GlobPattern.h | 10 + .../llvm/include/llvm/Support/GraphWriter.h | 8 +- .../llvm-project/llvm/include/llvm/Support/Host.h | 14 + .../llvm/include/llvm/Support/InitLLVM.h | 3 +- .../llvm/include/llvm/Support/InstructionCost.h | 238 + 
.../llvm-project/llvm/include/llvm/Support/JSON.h | 205 +- .../llvm/include/llvm/Support/KnownBits.h | 145 +- .../llvm/include/llvm/Support/LineIterator.h | 8 +- .../llvm/include/llvm/Support/MachineValueType.h | 342 +- .../llvm/include/llvm/Support/MathExtras.h | 2 +- .../llvm/include/llvm/Support/MemoryBuffer.h | 24 +- .../llvm/include/llvm/Support/MemoryBufferRef.h | 56 + .../llvm/include/llvm/Support/Parallel.h | 104 +- .../llvm-project/llvm/include/llvm/Support/Path.h | 38 + .../llvm/include/llvm/Support/PluginLoader.h | 4 + .../llvm/include/llvm/Support/Process.h | 10 +- .../llvm/include/llvm/Support/Program.h | 16 +- .../include/llvm/Support/RISCVTargetParser.def | 14 + .../llvm/include/llvm/Support/Signals.h | 6 +- .../llvm/include/llvm/Support/Signposts.h | 11 +- .../llvm/include/llvm/Support/SourceMgr.h | 5 + .../llvm/include/llvm/Support/SuffixTree.h | 4 +- .../llvm/include/llvm/Support/SwapByteOrder.h | 2 +- .../include/llvm/Support/SymbolRemappingReader.h | 2 +- .../llvm/include/llvm/Support/TargetOpcodes.def | 77 +- .../llvm/include/llvm/Support/TargetParser.h | 25 +- .../llvm/include/llvm/Support/TargetRegistry.h | 2 + .../llvm/include/llvm/Support/TaskQueue.h | 2 +- .../llvm/include/llvm/Support/Threading.h | 12 +- .../llvm/include/llvm/Support/ToolOutputFile.h | 4 + .../llvm/include/llvm/Support/TrigramIndex.h | 4 +- .../llvm/include/llvm/Support/TypeSize.h | 531 +- .../llvm/include/llvm/Support/VirtualFileSystem.h | 31 +- .../llvm/include/llvm/Support/Win64EH.h | 8 +- .../llvm/include/llvm/Support/X86TargetParser.def | 9 + .../llvm/include/llvm/Support/X86TargetParser.h | 19 +- .../llvm/include/llvm/Support/YAMLParser.h | 13 +- .../llvm/include/llvm/Support/YAMLTraits.h | 92 +- .../llvm/include/llvm/Support/raw_ostream.h | 114 +- .../llvm/include/llvm/TableGen/DirectiveEmitter.h | 211 + .../llvm/include/llvm/TableGen/Error.h | 19 +- .../llvm/include/llvm/TableGen/Record.h | 362 +- .../llvm/include/llvm/TableGen/SearchableTable.td | 26 +- 
.../llvm/include/llvm/Target/CGPassBuilderOption.h | 65 + .../llvm/include/llvm/Target/GenericOpcodes.td | 602 ++- .../llvm/include/llvm/Target/GlobalISel/Combine.td | 353 +- .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 34 +- .../llvm/include/llvm/Target/Target.td | 396 +- .../llvm/include/llvm/Target/TargetCallingConv.td | 6 +- .../include/llvm/Target/TargetInstrPredicate.td | 27 +- .../llvm/include/llvm/Target/TargetItinerary.td | 2 +- .../include/llvm/Target/TargetLoweringObjectFile.h | 36 +- .../llvm/include/llvm/Target/TargetMachine.h | 70 +- .../llvm/include/llvm/Target/TargetOptions.h | 67 +- .../llvm/include/llvm/Target/TargetPfmCounters.td | 2 + .../llvm/include/llvm/Target/TargetSchedule.td | 28 +- .../llvm/include/llvm/Target/TargetSelectionDAG.td | 271 +- .../include/llvm/Testing/Support/SupportHelpers.h | 138 + .../llvm/include/llvm/TextAPI/ELF/ELFStub.h | 68 - .../llvm/include/llvm/TextAPI/ELF/TBEHandler.h | 43 - .../llvm/include/llvm/TextAPI/MachO/Platform.h | 5 +- .../AggressiveInstCombine/AggressiveInstCombine.h | 1 - .../llvm/include/llvm/Transforms/Coroutines.h | 2 +- .../llvm/Transforms/Coroutines/CoroCleanup.h | 1 + .../include/llvm/Transforms/Coroutines/CoroEarly.h | 1 + .../include/llvm/Transforms/Coroutines/CoroElide.h | 1 + .../include/llvm/Transforms/Coroutines/CoroSplit.h | 5 + .../include/llvm/Transforms/HelloNew/HelloWorld.h | 23 + .../llvm/include/llvm/Transforms/IPO.h | 18 +- .../include/llvm/Transforms/IPO/AlwaysInliner.h | 1 + .../llvm/Transforms/IPO/Annotation2Metadata.h | 30 + .../llvm/include/llvm/Transforms/IPO/Attributor.h | 658 ++- .../include/llvm/Transforms/IPO/BlockExtractor.h | 25 + .../llvm/Transforms/IPO/CalledValuePropagation.h | 1 - .../llvm/include/llvm/Transforms/IPO/CrossDSOCFI.h | 1 - .../llvm/Transforms/IPO/ForceFunctionAttrs.h | 1 - .../llvm/include/llvm/Transforms/IPO/IROutliner.h | 358 ++ .../llvm/include/llvm/Transforms/IPO/Inliner.h | 14 +- .../include/llvm/Transforms/IPO/LoopExtractor.h | 32 + 
.../include/llvm/Transforms/IPO/LowerTypeTests.h | 10 +- .../llvm/include/llvm/Transforms/IPO/OpenMPOpt.h | 10 + .../llvm/Transforms/IPO/SampleContextTracker.h | 152 + .../include/llvm/Transforms/IPO/SampleProfile.h | 9 +- .../llvm/Transforms/IPO/SampleProfileProbe.h | 147 + .../include/llvm/Transforms/IPO/StripSymbols.h | 47 + .../llvm/Transforms/IPO/WholeProgramDevirt.h | 3 + .../llvm/Transforms/InstCombine/InstCombiner.h | 528 ++ .../llvm/include/llvm/Transforms/Instrumentation.h | 8 +- .../Transforms/Instrumentation/AddressSanitizer.h | 2 + .../Transforms/Instrumentation/BoundsChecking.h | 1 + .../Transforms/Instrumentation/DataFlowSanitizer.h | 32 + .../llvm/Transforms/Instrumentation/GCOVProfiler.h | 2 +- .../Instrumentation/HWAddressSanitizer.h | 19 + .../Transforms/Instrumentation/InstrProfiling.h | 5 - .../llvm/Transforms/Instrumentation/MemProfiler.h | 51 + .../Transforms/Instrumentation/MemorySanitizer.h | 1 + .../Transforms/Instrumentation/SanitizerCoverage.h | 1 + .../Transforms/Instrumentation/ThreadSanitizer.h | 1 + .../llvm/include/llvm/Transforms/ObjCARC.h | 18 + .../llvm/include/llvm/Transforms/Scalar.h | 67 +- .../Transforms/Scalar/AlignmentFromAssumptions.h | 6 +- .../llvm/Transforms/Scalar/AnnotationRemarks.h | 26 + .../llvm/Transforms/Scalar/ConstantHoisting.h | 1 - .../llvm/Transforms/Scalar/ConstraintElimination.h | 24 + .../llvm/include/llvm/Transforms/Scalar/DCE.h | 6 + .../llvm/include/llvm/Transforms/Scalar/GVN.h | 15 +- .../llvm/Transforms/Scalar/IndVarSimplify.h | 4 + .../llvm/Transforms/Scalar/InferAddressSpaces.h | 27 + .../llvm/Transforms/Scalar/InstSimplifyPass.h | 4 - .../include/llvm/Transforms/Scalar/JumpThreading.h | 71 +- .../include/llvm/Transforms/Scalar/LoopFlatten.h | 32 + .../llvm/Transforms/Scalar/LoopIdiomRecognize.h | 13 + .../llvm/Transforms/Scalar/LoopInterchange.h | 24 + .../llvm/Transforms/Scalar/LoopPassManager.h | 414 +- .../include/llvm/Transforms/Scalar/LoopReroll.h | 27 + 
.../include/llvm/Transforms/Scalar/LoopRotation.h | 4 +- .../llvm/Transforms/Scalar/LoopUnrollPass.h | 2 +- .../llvm/Transforms/Scalar/LoopVersioningLICM.h | 25 + .../include/llvm/Transforms/Scalar/LowerAtomic.h | 1 + .../llvm/Transforms/Scalar/LowerExpectIntrinsic.h | 3 + .../llvm/Transforms/Scalar/LowerMatrixIntrinsics.h | 8 +- .../llvm/Transforms/Scalar/MemCpyOptimizer.h | 29 +- .../llvm/Transforms/Scalar/NaryReassociate.h | 2 +- .../llvm/include/llvm/Transforms/Scalar/Reg2Mem.h | 27 + .../llvm/include/llvm/Transforms/Scalar/SROA.h | 5 +- .../Transforms/Scalar/ScalarizeMaskedMemIntrin.h | 29 + .../Transforms/Scalar/SeparateConstOffsetFromGEP.h | 27 + .../include/llvm/Transforms/Scalar/SimplifyCFG.h | 10 +- .../Transforms/Scalar/StraightLineStrengthReduce.h | 24 + .../llvm/Transforms/Scalar/StructurizeCFG.h | 20 + .../llvm/include/llvm/Transforms/Utils.h | 2 +- .../llvm/Transforms/Utils/BasicBlockUtils.h | 129 +- .../include/llvm/Transforms/Utils/BuildLibCalls.h | 4 + .../llvm/Transforms/Utils/CallGraphUpdater.h | 2 +- .../llvm/include/llvm/Transforms/Utils/Cloning.h | 43 + .../llvm/include/llvm/Transforms/Utils/Debugify.h | 65 +- .../include/llvm/Transforms/Utils/FixIrreducible.h | 20 + .../Utils/ImportedFunctionsInliningStatistics.h | 106 - .../llvm/Transforms/Utils/InstructionNamer.h | 20 + .../llvm/include/llvm/Transforms/Utils/Local.h | 118 +- .../llvm/include/llvm/Transforms/Utils/LoopPeel.h | 40 + .../llvm/Transforms/Utils/LoopRotationUtils.h | 3 +- .../llvm/include/llvm/Transforms/Utils/LoopUtils.h | 105 +- .../include/llvm/Transforms/Utils/LoopVersioning.h | 24 +- .../include/llvm/Transforms/Utils/LowerSwitch.h | 26 + .../include/llvm/Transforms/Utils/MatrixUtils.h | 94 + .../include/llvm/Transforms/Utils/MetaRenamer.h | 26 + .../llvm/include/llvm/Transforms/Utils/MisExpect.h | 43 - .../include/llvm/Transforms/Utils/PredicateInfo.h | 34 +- .../Transforms/Utils/ScalarEvolutionExpander.h | 792 +-- .../llvm/Transforms/Utils/SimplifyCFGOptions.h | 77 + 
.../include/llvm/Transforms/Utils/SimplifyIndVar.h | 23 + .../llvm/Transforms/Utils/SimplifyLibCalls.h | 2 +- .../llvm/include/llvm/Transforms/Utils/SizeOpts.h | 11 - .../llvm/Transforms/Utils/StripGCRelocates.h | 25 + .../Transforms/Utils/StripNonLineTableDebugInfo.h | 26 + .../llvm/Transforms/Utils/UnifyFunctionExitNodes.h | 27 +- .../include/llvm/Transforms/Utils/UnifyLoopExits.h | 22 + .../include/llvm/Transforms/Utils/UnrollLoop.h | 17 - .../Vectorize/LoopVectorizationLegality.h | 74 +- .../llvm/Transforms/Vectorize/SLPVectorizer.h | 7 +- .../llvm/include/llvm/module.modulemap | 20 +- .../llvm/lib/Analysis/AliasAnalysis.cpp | 100 +- .../llvm/lib/Analysis/AliasAnalysisEvaluator.cpp | 6 +- .../llvm/lib/Analysis/AliasSetTracker.cpp | 39 +- .../llvm-project/llvm/lib/Analysis/Analysis.cpp | 7 +- .../llvm/lib/Analysis/AssumeBundleQueries.cpp | 18 +- .../llvm/lib/Analysis/AssumptionCache.cpp | 21 +- .../llvm/lib/Analysis/BasicAliasAnalysis.cpp | 875 ++-- .../llvm/lib/Analysis/BranchProbabilityInfo.cpp | 914 ++-- contrib/llvm-project/llvm/lib/Analysis/CFG.cpp | 15 +- .../llvm-project/llvm/lib/Analysis/CFGPrinter.cpp | 12 +- .../llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp | 3 +- .../llvm/lib/Analysis/CGSCCPassManager.cpp | 558 ++- .../llvm-project/llvm/lib/Analysis/CallGraph.cpp | 48 +- .../llvm/lib/Analysis/CallGraphSCCPass.cpp | 33 +- .../llvm-project/llvm/lib/Analysis/CallPrinter.cpp | 5 +- .../llvm/lib/Analysis/CaptureTracking.cpp | 105 +- .../llvm-project/llvm/lib/Analysis/CodeMetrics.cpp | 15 +- .../llvm/lib/Analysis/ConstantFolding.cpp | 335 +- .../llvm/lib/Analysis/ConstraintSystem.cpp | 158 + .../llvm-project/llvm/lib/Analysis/CostModel.cpp | 13 +- contrib/llvm-project/llvm/lib/Analysis/DDG.cpp | 8 +- .../llvm-project/llvm/lib/Analysis/DDGPrinter.cpp | 150 + .../llvm/lib/Analysis/Delinearization.cpp | 48 +- .../llvm/lib/Analysis/DemandedBits.cpp | 107 +- .../llvm/lib/Analysis/DependenceAnalysis.cpp | 79 +- .../llvm/lib/Analysis/DependenceGraphBuilder.cpp | 
137 +- .../lib/Analysis/DevelopmentModeInlineAdvisor.cpp | 531 ++ .../llvm/lib/Analysis/DivergenceAnalysis.cpp | 342 +- .../llvm/lib/Analysis/DomTreeUpdater.cpp | 5 +- .../llvm/lib/Analysis/EHPersonalities.cpp | 38 +- .../lib/Analysis/FunctionPropertiesAnalysis.cpp | 88 + .../llvm/lib/Analysis/GlobalsModRef.cpp | 39 +- .../llvm/lib/Analysis/IRSimilarityIdentifier.cpp | 937 ++++ .../llvm/lib/Analysis/IVDescriptors.cpp | 350 +- .../ImportedFunctionsInliningStatistics.cpp | 212 + .../lib/Analysis/IndirectCallPromotionAnalysis.cpp | 4 +- .../llvm/lib/Analysis/InlineAdvisor.cpp | 190 +- .../llvm-project/llvm/lib/Analysis/InlineCost.cpp | 263 +- .../llvm/lib/Analysis/InlineFeaturesAnalysis.cpp | 41 - .../lib/Analysis/InlineSizeEstimatorAnalysis.cpp | 142 +- .../llvm-project/llvm/lib/Analysis/InstCount.cpp | 87 +- .../llvm/lib/Analysis/InstructionSimplify.cpp | 1238 +++-- .../llvm-project/llvm/lib/Analysis/Interval.cpp | 11 - .../llvm/lib/Analysis/LazyCallGraph.cpp | 297 +- .../llvm/lib/Analysis/LazyValueInfo.cpp | 342 +- .../llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp | 6 +- contrib/llvm-project/llvm/lib/Analysis/Lint.cpp | 399 +- contrib/llvm-project/llvm/lib/Analysis/Loads.cpp | 69 +- .../llvm/lib/Analysis/LoopAccessAnalysis.cpp | 106 +- .../llvm/lib/Analysis/LoopAnalysisManager.cpp | 1 + .../llvm/lib/Analysis/LoopCacheAnalysis.cpp | 17 +- .../llvm-project/llvm/lib/Analysis/LoopInfo.cpp | 32 +- .../llvm/lib/Analysis/LoopNestAnalysis.cpp | 136 +- .../llvm-project/llvm/lib/Analysis/LoopPass.cpp | 22 +- .../llvm/lib/Analysis/MLInlineAdvisor.cpp | 65 +- .../llvm/lib/Analysis/MemDepPrinter.cpp | 4 +- .../llvm/lib/Analysis/MemDerefPrinter.cpp | 34 + .../llvm/lib/Analysis/MemoryBuiltins.cpp | 27 +- .../llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 63 +- .../llvm/lib/Analysis/MemoryLocation.cpp | 86 +- .../llvm-project/llvm/lib/Analysis/MemorySSA.cpp | 248 +- .../llvm/lib/Analysis/MemorySSAUpdater.cpp | 102 +- .../llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp | 56 +- 
.../llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 4 +- .../llvm-project/llvm/lib/Analysis/MustExecute.cpp | 98 +- .../llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp | 15 +- .../llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp | 20 + .../llvm/lib/Analysis/ObjCARCInstKind.cpp | 1 - .../lib/Analysis/OptimizationRemarkEmitter.cpp | 30 +- .../llvm-project/llvm/lib/Analysis/PhiValues.cpp | 1 - .../llvm-project/llvm/lib/Analysis/RegionInfo.cpp | 3 - .../llvm-project/llvm/lib/Analysis/RegionPass.cpp | 72 +- .../llvm/lib/Analysis/ReleaseModeModelRunner.cpp | 7 +- .../llvm/lib/Analysis/ReplayInlineAdvisor.cpp | 82 + .../llvm/lib/Analysis/ScalarEvolution.cpp | 2002 +++++--- .../lib/Analysis/ScalarEvolutionAliasAnalysis.cpp | 6 +- .../llvm/lib/Analysis/ScalarEvolutionDivision.cpp | 12 +- .../llvm/lib/Analysis/ScopedNoAliasAA.cpp | 28 +- .../llvm/lib/Analysis/StackLifetime.cpp | 83 +- .../llvm/lib/Analysis/StackSafetyAnalysis.cpp | 256 +- .../llvm/lib/Analysis/SyncDependenceAnalysis.cpp | 469 +- contrib/llvm-project/llvm/lib/Analysis/TFUtils.cpp | 338 +- .../llvm/lib/Analysis/TargetLibraryInfo.cpp | 38 +- .../llvm/lib/Analysis/TargetTransformInfo.cpp | 168 +- .../llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 82 + .../llvm/lib/Analysis/VFABIDemangling.cpp | 10 +- .../llvm/lib/Analysis/ValueTracking.cpp | 1485 +++--- .../llvm-project/llvm/lib/Analysis/VectorUtils.cpp | 77 +- .../llvm/lib/Analysis/models/inliner/README.txt | 3 + .../lib/Analysis/models/inliner/output_spec.json | 14 + .../llvm-project/llvm/lib/AsmParser/LLLexer.cpp | 8 + .../llvm-project/llvm/lib/AsmParser/LLParser.cpp | 3734 +++++++------- contrib/llvm-project/llvm/lib/AsmParser/LLParser.h | 444 +- contrib/llvm-project/llvm/lib/AsmParser/LLToken.h | 7 + .../llvm-project/llvm/lib/BinaryFormat/Dwarf.cpp | 25 + .../llvm-project/llvm/lib/BinaryFormat/MachO.cpp | 8 +- .../llvm/lib/BinaryFormat/MsgPackDocument.cpp | 3 +- .../llvm-project/llvm/lib/BinaryFormat/Wasm.cpp | 4 + .../llvm-project/llvm/lib/BinaryFormat/XCOFF.cpp | 78 + 
.../llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 5 + .../llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 291 +- .../llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 192 +- .../llvm/lib/Bitcode/Reader/ValueList.cpp | 3 - .../llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 223 +- .../llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 37 +- .../llvm/lib/Bitstream/Reader/BitstreamReader.cpp | 12 +- .../llvm/lib/CodeGen/AllocationOrder.cpp | 17 +- .../llvm/lib/CodeGen/AllocationOrder.h | 128 +- contrib/llvm-project/llvm/lib/CodeGen/Analysis.cpp | 61 +- .../llvm/lib/CodeGen/AsmPrinter/AIXException.cpp | 79 + .../llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 14 +- .../llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp | 4 +- .../llvm/lib/CodeGen/AsmPrinter/AddressPool.h | 2 +- .../llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 589 ++- .../lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 49 +- .../lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 42 +- .../llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h | 8 +- .../llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 176 +- .../llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h | 11 +- .../llvm/lib/CodeGen/AsmPrinter/DIE.cpp | 90 +- .../llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp | 11 +- .../llvm/lib/CodeGen/AsmPrinter/DIEHash.h | 4 +- .../AsmPrinter/DbgEntityHistoryCalculator.cpp | 185 +- .../lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 76 +- .../lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 7 +- .../lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 86 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 12 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 482 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 64 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfException.h | 14 + .../lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 46 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h | 14 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp | 12 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 3 +- .../lib/CodeGen/AsmPrinter/DwarfStringPool.cpp | 10 +- 
.../llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h | 2 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 282 +- .../llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 34 +- .../llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 397 +- .../llvm/lib/CodeGen/AsmPrinter/EHStreamer.h | 43 +- .../llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 7 +- .../lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp | 84 + .../lib/CodeGen/AsmPrinter/PseudoProbePrinter.h | 53 + .../llvm/lib/CodeGen/AsmPrinter/WasmException.cpp | 11 +- .../llvm/lib/CodeGen/AsmPrinter/WasmException.h | 1 + .../llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 53 +- .../llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h | 1 + .../llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 56 +- .../llvm/lib/CodeGen/AtomicExpandPass.cpp | 17 +- .../llvm/lib/CodeGen/BBSectionsPrepare.cpp | 457 -- .../llvm/lib/CodeGen/BasicBlockSections.cpp | 484 ++ .../llvm/lib/CodeGen/BranchFolding.cpp | 16 +- .../llvm-project/llvm/lib/CodeGen/BranchFolding.h | 3 +- .../llvm/lib/CodeGen/BranchRelaxation.cpp | 40 +- .../llvm/lib/CodeGen/BreakFalseDeps.cpp | 32 +- .../llvm/lib/CodeGen/CalcSpillWeights.cpp | 289 +- .../llvm/lib/CodeGen/CallingConvLower.cpp | 23 +- contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp | 6 +- .../llvm/lib/CodeGen/CodeGenPassBuilder.cpp | 25 + .../llvm/lib/CodeGen/CodeGenPrepare.cpp | 387 +- .../llvm-project/llvm/lib/CodeGen/CommandFlags.cpp | 96 +- .../lib/CodeGen/DeadMachineInstructionElim.cpp | 21 +- .../llvm/lib/CodeGen/DetectDeadLanes.cpp | 5 +- .../llvm/lib/CodeGen/DwarfEHPrepare.cpp | 232 +- .../llvm/lib/CodeGen/EarlyIfConversion.cpp | 106 +- .../llvm/lib/CodeGen/ExpandReductions.cpp | 128 +- .../lib/CodeGen/FixupStatepointCallerSaved.cpp | 362 +- .../llvm/lib/CodeGen/GCRootLowering.cpp | 5 +- .../llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 28 +- .../llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 34 +- .../llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 624 ++- .../llvm/lib/CodeGen/GlobalISel/Combiner.cpp | 5 +- 
.../llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2031 +++++++- .../lib/CodeGen/GlobalISel/GISelChangeObserver.cpp | 2 +- .../llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 249 +- .../llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 893 +++- .../lib/CodeGen/GlobalISel/InlineAsmLowering.cpp | 6 + .../lib/CodeGen/GlobalISel/InstructionSelect.cpp | 2 +- .../lib/CodeGen/GlobalISel/InstructionSelector.cpp | 14 +- .../lib/CodeGen/GlobalISel/LegalityPredicates.cpp | 24 +- .../lib/CodeGen/GlobalISel/LegalizeMutations.cpp | 10 + .../llvm/lib/CodeGen/GlobalISel/Legalizer.cpp | 2 +- .../lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 1612 ++++-- .../llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 6 +- .../llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 23 + .../lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 145 +- .../lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 3 +- .../llvm/lib/CodeGen/GlobalISel/Utils.cpp | 390 +- .../llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp | 5 +- .../llvm/lib/CodeGen/HardwareLoops.cpp | 51 +- .../llvm-project/llvm/lib/CodeGen/IfConversion.cpp | 7 +- .../llvm/lib/CodeGen/ImplicitNullChecks.cpp | 258 +- .../llvm/lib/CodeGen/InlineSpiller.cpp | 107 +- .../llvm/lib/CodeGen/InterferenceCache.cpp | 10 +- .../llvm/lib/CodeGen/InterferenceCache.h | 18 +- .../llvm/lib/CodeGen/InterleavedAccessPass.cpp | 119 +- .../lib/CodeGen/InterleavedLoadCombinePass.cpp | 17 +- .../llvm/lib/CodeGen/IntrinsicLowering.cpp | 1 + .../llvm/lib/CodeGen/LLVMTargetMachine.cpp | 63 +- .../llvm/lib/CodeGen/LexicalScopes.cpp | 2 +- .../llvm/lib/CodeGen/LiveDebugValues.cpp | 1976 -------- .../CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 3363 +++++++++++++ .../CodeGen/LiveDebugValues/LiveDebugValues.cpp | 97 + .../lib/CodeGen/LiveDebugValues/LiveDebugValues.h | 32 + .../CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 1994 ++++++++ .../llvm/lib/CodeGen/LiveDebugVariables.cpp | 71 +- .../llvm/lib/CodeGen/LiveDebugVariables.h | 1 - .../llvm-project/llvm/lib/CodeGen/LiveInterval.cpp | 23 +- 
.../llvm/lib/CodeGen/LiveIntervalCalc.cpp | 6 +- .../llvm/lib/CodeGen/LiveIntervalUnion.cpp | 16 +- .../llvm/lib/CodeGen/LiveIntervals.cpp | 79 +- .../llvm/lib/CodeGen/LiveRangeEdit.cpp | 29 +- .../llvm/lib/CodeGen/LiveRangeShrink.cpp | 3 +- .../llvm/lib/CodeGen/LiveRegMatrix.cpp | 52 +- .../llvm-project/llvm/lib/CodeGen/LiveRegUnits.cpp | 4 - .../llvm/lib/CodeGen/LiveVariables.cpp | 61 +- .../llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 11 +- .../llvm-project/llvm/lib/CodeGen/LowLevelType.cpp | 16 + .../llvm-project/llvm/lib/CodeGen/LowerEmuTLS.cpp | 3 +- .../llvm-project/llvm/lib/CodeGen/MBFIWrapper.cpp | 12 + .../llvm/lib/CodeGen/MIRCanonicalizerPass.cpp | 15 +- .../llvm/lib/CodeGen/MIRParser/MILexer.cpp | 11 +- .../llvm/lib/CodeGen/MIRParser/MILexer.h | 2 + .../llvm/lib/CodeGen/MIRParser/MIParser.cpp | 34 +- .../llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 37 +- .../llvm-project/llvm/lib/CodeGen/MIRPrinter.cpp | 137 +- .../llvm/lib/CodeGen/MIRVRegNamerUtils.cpp | 14 + .../llvm/lib/CodeGen/MachineBasicBlock.cpp | 244 +- .../llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp | 15 +- .../llvm/lib/CodeGen/MachineBlockPlacement.cpp | 84 +- .../llvm-project/llvm/lib/CodeGen/MachineCSE.cpp | 48 +- .../llvm/lib/CodeGen/MachineCheckDebugify.cpp | 126 + .../llvm/lib/CodeGen/MachineCombiner.cpp | 72 +- .../llvm/lib/CodeGen/MachineCopyPropagation.cpp | 126 +- .../llvm/lib/CodeGen/MachineDebugify.cpp | 32 +- .../llvm/lib/CodeGen/MachineFunction.cpp | 125 +- .../llvm/lib/CodeGen/MachineFunctionPass.cpp | 1 - .../lib/CodeGen/MachineFunctionPrinterPass.cpp | 4 +- .../llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 155 + .../llvm-project/llvm/lib/CodeGen/MachineInstr.cpp | 179 +- .../llvm-project/llvm/lib/CodeGen/MachineLICM.cpp | 169 +- .../llvm/lib/CodeGen/MachineLoopInfo.cpp | 56 + .../llvm/lib/CodeGen/MachineLoopUtils.cpp | 11 - .../llvm/lib/CodeGen/MachineModuleInfo.cpp | 27 +- .../llvm/lib/CodeGen/MachineOperand.cpp | 30 +- .../llvm/lib/CodeGen/MachineOutliner.cpp | 21 +- 
.../llvm/lib/CodeGen/MachinePassManager.cpp | 121 + .../llvm/lib/CodeGen/MachinePipeliner.cpp | 51 +- .../llvm/lib/CodeGen/MachineRegisterInfo.cpp | 25 +- .../llvm/lib/CodeGen/MachineSSAUpdater.cpp | 11 +- .../llvm/lib/CodeGen/MachineScheduler.cpp | 256 +- .../llvm-project/llvm/lib/CodeGen/MachineSink.cpp | 290 +- .../llvm/lib/CodeGen/MachineStableHash.cpp | 194 + .../llvm/lib/CodeGen/MachineTraceMetrics.cpp | 28 +- .../llvm/lib/CodeGen/MachineVerifier.cpp | 412 +- .../llvm/lib/CodeGen/ModuloSchedule.cpp | 2 - .../llvm/lib/CodeGen/MultiHazardRecognizer.cpp | 92 + .../llvm/lib/CodeGen/PHIElimination.cpp | 74 +- .../llvm/lib/CodeGen/PHIEliminationUtils.cpp | 2 +- .../llvm-project/llvm/lib/CodeGen/ParallelCG.cpp | 2 + .../llvm/lib/CodeGen/PeepholeOptimizer.cpp | 165 +- .../llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 8 +- .../llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 2 +- .../llvm/lib/CodeGen/PrologEpilogInserter.cpp | 40 +- .../llvm/lib/CodeGen/PseudoProbeInserter.cpp | 95 + contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp | 5 - .../llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp | 151 +- .../llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp | 35 +- .../llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 186 +- .../llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp | 34 +- .../llvm-project/llvm/lib/CodeGen/RegAllocBase.h | 4 +- .../llvm/lib/CodeGen/RegAllocBasic.cpp | 51 +- .../llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp | 1486 +++--- .../llvm/lib/CodeGen/RegAllocGreedy.cpp | 537 +- .../llvm-project/llvm/lib/CodeGen/RegAllocPBQP.cpp | 117 +- .../llvm/lib/CodeGen/RegisterClassInfo.cpp | 13 +- .../llvm/lib/CodeGen/RegisterCoalescer.cpp | 270 +- .../llvm/lib/CodeGen/RegisterCoalescer.h | 14 +- .../llvm/lib/CodeGen/RegisterPressure.cpp | 80 +- .../llvm/lib/CodeGen/RegisterScavenging.cpp | 43 +- .../llvm/lib/CodeGen/RenameIndependentSubregs.cpp | 10 +- .../llvm-project/llvm/lib/CodeGen/SafeStack.cpp | 35 +- .../llvm/lib/CodeGen/SafeStackLayout.cpp | 9 +- 
.../llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 911 ---- .../llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 15 +- .../llvm/lib/CodeGen/ScheduleDAGPrinter.cpp | 2 +- .../llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2396 +++++---- .../llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 257 +- .../CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 10 +- .../llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 103 +- .../llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h | 12 +- .../llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 551 +- .../CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 289 +- .../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 396 +- .../lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 17 +- .../llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 62 +- .../CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 11 +- .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 25 +- .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 1082 ++-- .../lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +- .../lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 17 +- .../CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 31 +- .../llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 946 ++-- .../SelectionDAG/SelectionDAGAddressAnalysis.cpp | 31 +- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 776 +-- .../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 22 +- .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 53 +- .../lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 164 +- .../CodeGen/SelectionDAG/StatepointLowering.cpp | 322 +- .../lib/CodeGen/SelectionDAG/TargetLowering.cpp | 1071 +++- .../llvm-project/llvm/lib/CodeGen/ShrinkWrap.cpp | 13 +- .../llvm/lib/CodeGen/SjLjEHPrepare.cpp | 2 +- .../llvm/lib/CodeGen/SpillPlacement.cpp | 4 - contrib/llvm-project/llvm/lib/CodeGen/SplitKit.cpp | 78 +- contrib/llvm-project/llvm/lib/CodeGen/SplitKit.h | 13 +- .../llvm/lib/CodeGen/StackColoring.cpp | 56 +- .../llvm-project/llvm/lib/CodeGen/StackMaps.cpp | 180 +- .../llvm/lib/CodeGen/StackProtector.cpp | 48 +- 
.../llvm/lib/CodeGen/StackSlotColoring.cpp | 17 +- .../llvm/lib/CodeGen/SwiftErrorValueTracking.cpp | 4 +- .../llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 4 +- .../llvm/lib/CodeGen/TailDuplicator.cpp | 3 +- .../llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 11 +- .../llvm/lib/CodeGen/TargetInstrInfo.cpp | 44 +- .../llvm/lib/CodeGen/TargetLoweringBase.cpp | 272 +- .../lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 443 +- .../llvm/lib/CodeGen/TargetOptionsImpl.cpp | 8 +- .../llvm/lib/CodeGen/TargetPassConfig.cpp | 303 +- .../llvm/lib/CodeGen/TargetRegisterInfo.cpp | 53 +- .../llvm/lib/CodeGen/TargetSubtargetInfo.cpp | 11 +- .../llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 270 +- .../llvm/lib/CodeGen/TypePromotion.cpp | 12 +- .../llvm-project/llvm/lib/CodeGen/ValueTypes.cpp | 51 +- .../llvm-project/llvm/lib/CodeGen/VirtRegMap.cpp | 17 +- .../llvm/lib/CodeGen/WasmEHPrepare.cpp | 31 +- .../llvm-project/llvm/lib/CodeGen/WinEHPrepare.cpp | 8 +- .../llvm/lib/CodeGen/XRayInstrumentation.cpp | 12 +- .../llvm/lib/DWARFLinker/DWARFLinker.cpp | 417 +- .../lib/DWARFLinker/DWARFLinkerCompileUnit.cpp | 6 +- .../lib/DWARFLinker/DWARFLinkerDeclContext.cpp | 80 +- .../llvm/lib/DWARFLinker/DWARFStreamer.cpp | 50 +- .../lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 2 - .../CodeView/DebugFrameDataSubsection.cpp | 7 +- .../llvm/lib/DebugInfo/CodeView/EnumTables.cpp | 12 +- .../CodeView/LazyRandomTypeCollection.cpp | 10 +- .../llvm/lib/DebugInfo/CodeView/RecordName.cpp | 8 +- .../lib/DebugInfo/CodeView/RecordSerialization.cpp | 2 +- .../lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp | 3 +- .../lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 2 +- .../lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 2 +- .../llvm/lib/DebugInfo/DWARF/DWARFAddressRange.cpp | 5 +- .../llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp | 7 +- .../llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 113 +- .../lib/DebugInfo/DWARF/DWARFDataExtractor.cpp | 14 +- .../lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp | 16 +- 
.../llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 3 +- .../llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 53 +- .../lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp | 3 +- .../llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 84 +- .../llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 14 +- .../llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp | 14 +- .../lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 7 + .../lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp | 139 +- .../llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 74 +- .../llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp | 29 +- .../llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp | 13 +- .../llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 4 +- .../llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp | 9 +- .../llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 7 +- .../llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 194 +- .../llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 42 + .../llvm/lib/DebugInfo/GSYM/GsymCreator.cpp | 2 +- .../llvm/lib/DebugInfo/MSF/MSFBuilder.cpp | 6 +- .../PDB/Native/DbiModuleDescriptorBuilder.cpp | 81 +- .../lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 10 +- .../lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp | 2 +- .../lib/DebugInfo/PDB/Native/NamedStreamMap.cpp | 2 +- .../lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp | 41 + .../DebugInfo/PDB/Native/NativeFunctionSymbol.cpp | 98 +- .../PDB/Native/NativeInlineSiteSymbol.cpp | 177 + .../lib/DebugInfo/PDB/Native/NativeLineNumber.cpp | 7 +- .../DebugInfo/PDB/Native/NativePublicSymbol.cpp | 6 +- .../lib/DebugInfo/PDB/Native/NativeSession.cpp | 81 +- .../lib/DebugInfo/PDB/Native/NativeSourceFile.cpp | 2 +- .../lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp | 2 +- .../llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp | 195 +- .../lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 62 +- .../llvm/lib/DebugInfo/PDB/PDBContext.cpp | 39 +- .../llvm/lib/DebugInfo/PDB/PDBExtras.cpp | 16 +- .../llvm/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp | 4 +- .../llvm/lib/DebugInfo/PDB/PDBSymbol.cpp | 17 + 
.../llvm/lib/DebugInfo/PDB/UDTLayout.cpp | 8 +- .../llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp | 6 +- .../DebugInfo/Symbolize/SymbolizableObjectFile.cpp | 9 - .../DebugInfo/Symbolize/SymbolizableObjectFile.h | 6 +- .../llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 19 +- .../llvm-project/llvm/lib/Demangle/Demangle.cpp | 2 +- .../llvm/lib/Demangle/MicrosoftDemangleNodes.cpp | 1 - .../llvm/lib/ExecutionEngine/ExecutionEngine.cpp | 15 +- .../IntelJITEvents/IntelJITEventListener.cpp | 396 +- .../IntelJITEvents/IntelJITEventsWrapper.h | 45 +- .../Interpreter/ExternalFunctions.cpp | 2 +- .../lib/ExecutionEngine/JITLink/EHFrameSupport.cpp | 334 +- .../ExecutionEngine/JITLink/EHFrameSupportImpl.h | 24 +- .../llvm/lib/ExecutionEngine/JITLink/ELF.cpp | 66 +- .../lib/ExecutionEngine/JITLink/ELF_x86_64.cpp | 544 +- .../llvm/lib/ExecutionEngine/JITLink/JITLink.cpp | 58 +- .../lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp | 62 +- .../lib/ExecutionEngine/JITLink/JITLinkGeneric.h | 16 +- .../JITLink/JITLinkMemoryManager.cpp | 3 +- .../llvm/lib/ExecutionEngine/JITLink/MachO.cpp | 60 +- .../JITLink/MachOLinkGraphBuilder.cpp | 10 +- .../JITLink/MachOLinkGraphBuilder.h | 4 +- .../lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 111 +- .../lib/ExecutionEngine/JITLink/MachO_x86_64.cpp | 65 +- .../llvm/lib/ExecutionEngine/MCJIT/MCJIT.h | 10 +- .../ExecutionEngine/Orc/CompileOnDemandLayer.cpp | 76 +- .../llvm/lib/ExecutionEngine/Orc/Core.cpp | 2636 ++++++---- .../lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 40 +- .../lib/ExecutionEngine/Orc/IRCompileLayer.cpp | 6 +- .../lib/ExecutionEngine/Orc/IRTransformLayer.cpp | 6 +- .../lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 21 +- .../llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 188 +- .../llvm/lib/ExecutionEngine/Orc/Layer.cpp | 67 +- .../llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp | 72 +- .../llvm/lib/ExecutionEngine/Orc/Legacy.cpp | 68 - .../llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 57 +- .../llvm/lib/ExecutionEngine/Orc/NullResolver.cpp 
| 37 - .../lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp | 339 +- .../ExecutionEngine/Orc/ObjectTransformLayer.cpp | 7 +- .../llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp | 158 - .../lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 534 -- .../ExecutionEngine/Orc/OrcMCJITReplacement.cpp | 138 - .../lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 502 -- .../lib/ExecutionEngine/Orc/OrcV2CBindings.cpp | 303 +- .../Orc/RTDyldObjectLinkingLayer.cpp | 149 +- .../lib/ExecutionEngine/Orc/Shared/OrcError.cpp | 120 + .../lib/ExecutionEngine/Orc/Shared/RPCError.cpp | 58 + .../Orc/Shared/TargetProcessControlTypes.cpp | 44 + .../lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp | 9 +- .../llvm/lib/ExecutionEngine/Orc/Speculation.cpp | 7 +- .../Orc/TPCDynamicLibrarySearchGenerator.cpp | 70 + .../ExecutionEngine/Orc/TPCEHFrameRegistrar.cpp | 80 + .../ExecutionEngine/Orc/TPCIndirectionUtils.cpp | 423 ++ .../Orc/TargetProcess/RegisterEHFrames.cpp | 208 + .../Orc/TargetProcess/TargetExecutionUtils.cpp | 43 + .../ExecutionEngine/Orc/TargetProcessControl.cpp | 153 + .../lib/ExecutionEngine/Orc/ThreadSafeModule.cpp | 2 +- .../llvm/lib/ExecutionEngine/OrcError/OrcError.cpp | 121 - .../llvm/lib/ExecutionEngine/OrcError/RPCError.cpp | 59 - .../RuntimeDyld/RTDyldMemoryManager.cpp | 12 +- .../ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 37 +- .../RuntimeDyld/RuntimeDyldChecker.cpp | 4 +- .../ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 25 +- .../ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 7 +- .../RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 9 +- .../lib/ExecutionEngine/SectionMemoryManager.cpp | 20 +- .../llvm-project/llvm/lib/FileCheck/FileCheck.cpp | 2754 ++++++++++ .../llvm/lib/FileCheck/FileCheckImpl.h | 859 ++++ .../llvm/lib/Frontend/OpenMP/OMPContext.cpp | 38 +- .../llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 830 +++- .../llvm-project/llvm/lib/FuzzMutate/IRMutator.cpp | 43 + contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp | 146 +- contrib/llvm-project/llvm/lib/IR/Assumptions.cpp | 
36 + contrib/llvm-project/llvm/lib/IR/AttributeImpl.h | 7 +- contrib/llvm-project/llvm/lib/IR/Attributes.cpp | 241 +- contrib/llvm-project/llvm/lib/IR/AutoUpgrade.cpp | 490 +- contrib/llvm-project/llvm/lib/IR/BasicBlock.cpp | 142 +- contrib/llvm-project/llvm/lib/IR/ConstantFold.cpp | 151 +- contrib/llvm-project/llvm/lib/IR/ConstantRange.cpp | 97 +- contrib/llvm-project/llvm/lib/IR/Constants.cpp | 415 +- contrib/llvm-project/llvm/lib/IR/Core.cpp | 54 +- contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp | 47 +- contrib/llvm-project/llvm/lib/IR/DataLayout.cpp | 414 +- contrib/llvm-project/llvm/lib/IR/DebugInfo.cpp | 35 +- .../llvm-project/llvm/lib/IR/DebugInfoMetadata.cpp | 142 +- contrib/llvm-project/llvm/lib/IR/DebugLoc.cpp | 18 +- .../llvm-project/llvm/lib/IR/DiagnosticInfo.cpp | 26 +- contrib/llvm-project/llvm/lib/IR/Dominators.cpp | 24 +- contrib/llvm-project/llvm/lib/IR/Function.cpp | 128 +- contrib/llvm-project/llvm/lib/IR/Globals.cpp | 8 +- contrib/llvm-project/llvm/lib/IR/IRBuilder.cpp | 212 +- .../llvm-project/llvm/lib/IR/IRPrintingPasses.cpp | 4 +- contrib/llvm-project/llvm/lib/IR/Instruction.cpp | 35 +- contrib/llvm-project/llvm/lib/IR/Instructions.cpp | 338 +- contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp | 37 +- contrib/llvm-project/llvm/lib/IR/LLVMContext.cpp | 9 +- .../llvm-project/llvm/lib/IR/LLVMContextImpl.cpp | 40 +- contrib/llvm-project/llvm/lib/IR/LLVMContextImpl.h | 225 +- .../llvm/lib/IR/LLVMRemarkStreamer.cpp | 17 +- .../llvm-project/llvm/lib/IR/LegacyPassManager.cpp | 156 +- contrib/llvm-project/llvm/lib/IR/MDBuilder.cpp | 32 +- contrib/llvm-project/llvm/lib/IR/Mangler.cpp | 32 +- contrib/llvm-project/llvm/lib/IR/Metadata.cpp | 391 +- contrib/llvm-project/llvm/lib/IR/Module.cpp | 9 +- .../llvm/lib/IR/ModuleSummaryIndex.cpp | 28 +- contrib/llvm-project/llvm/lib/IR/Operator.cpp | 20 +- contrib/llvm-project/llvm/lib/IR/OptBisect.cpp | 2 + contrib/llvm-project/llvm/lib/IR/Pass.cpp | 13 +- .../llvm/lib/IR/PassInstrumentation.cpp | 19 + 
contrib/llvm-project/llvm/lib/IR/PassManager.cpp | 48 + contrib/llvm-project/llvm/lib/IR/PassRegistry.cpp | 6 +- .../llvm-project/llvm/lib/IR/PassTimingInfo.cpp | 60 +- contrib/llvm-project/llvm/lib/IR/PrintPasses.cpp | 88 + .../llvm-project/llvm/lib/IR/ProfileSummary.cpp | 2 +- contrib/llvm-project/llvm/lib/IR/PseudoProbe.cpp | 99 + .../llvm-project/llvm/lib/IR/ReplaceConstant.cpp | 70 + .../llvm/lib/IR/SafepointIRVerifier.cpp | 3 +- .../llvm-project/llvm/lib/IR/StructuralHash.cpp | 84 + contrib/llvm-project/llvm/lib/IR/Type.cpp | 63 +- contrib/llvm-project/llvm/lib/IR/Use.cpp | 29 +- contrib/llvm-project/llvm/lib/IR/User.cpp | 2 +- contrib/llvm-project/llvm/lib/IR/Value.cpp | 86 +- contrib/llvm-project/llvm/lib/IR/Verifier.cpp | 431 +- .../llvm-project/llvm/lib/IRReader/IRReader.cpp | 8 +- .../llvm/lib/InterfaceStub/ELFObjHandler.cpp | 680 +++ .../llvm/lib/InterfaceStub/ELFStub.cpp | 28 + .../llvm/lib/InterfaceStub/TBEHandler.cpp | 143 + contrib/llvm-project/llvm/lib/LTO/Caching.cpp | 1 + contrib/llvm-project/llvm/lib/LTO/LTO.cpp | 75 +- contrib/llvm-project/llvm/lib/LTO/LTOBackend.cpp | 240 +- .../llvm-project/llvm/lib/LTO/LTOCodeGenerator.cpp | 54 +- contrib/llvm-project/llvm/lib/LTO/LTOModule.cpp | 3 + .../llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 68 +- contrib/llvm-project/llvm/lib/Linker/IRMover.cpp | 158 +- .../llvm-project/llvm/lib/Linker/LinkModules.cpp | 2 +- .../llvm-project/llvm/lib/MC/ELFObjectWriter.cpp | 46 +- contrib/llvm-project/llvm/lib/MC/MCAsmBackend.cpp | 15 +- contrib/llvm-project/llvm/lib/MC/MCAsmInfo.cpp | 11 +- .../llvm-project/llvm/lib/MC/MCAsmInfoXCOFF.cpp | 9 + contrib/llvm-project/llvm/lib/MC/MCAsmMacro.cpp | 5 + contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp | 152 +- contrib/llvm-project/llvm/lib/MC/MCAssembler.cpp | 94 +- contrib/llvm-project/llvm/lib/MC/MCCodeView.cpp | 5 +- contrib/llvm-project/llvm/lib/MC/MCContext.cpp | 57 +- contrib/llvm-project/llvm/lib/MC/MCDwarf.cpp | 29 +- contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp 
| 35 +- contrib/llvm-project/llvm/lib/MC/MCExpr.cpp | 142 +- contrib/llvm-project/llvm/lib/MC/MCFragment.cpp | 24 + .../llvm-project/llvm/lib/MC/MCObjectFileInfo.cpp | 132 +- .../llvm-project/llvm/lib/MC/MCObjectStreamer.cpp | 90 +- .../llvm-project/llvm/lib/MC/MCParser/AsmLexer.cpp | 174 +- .../llvm/lib/MC/MCParser/AsmParser.cpp | 131 +- .../llvm/lib/MC/MCParser/COFFAsmParser.cpp | 10 +- .../llvm/lib/MC/MCParser/COFFMasmParser.cpp | 119 +- .../llvm/lib/MC/MCParser/DarwinAsmParser.cpp | 1 + .../llvm/lib/MC/MCParser/ELFAsmParser.cpp | 14 +- .../llvm/lib/MC/MCParser/MasmParser.cpp | 1903 ++++--- .../llvm/lib/MC/MCParser/WasmAsmParser.cpp | 82 +- contrib/llvm-project/llvm/lib/MC/MCPseudoProbe.cpp | 213 + contrib/llvm-project/llvm/lib/MC/MCSchedule.cpp | 4 +- contrib/llvm-project/llvm/lib/MC/MCSection.cpp | 3 +- contrib/llvm-project/llvm/lib/MC/MCSectionELF.cpp | 8 +- .../llvm-project/llvm/lib/MC/MCSectionMachO.cpp | 20 +- contrib/llvm-project/llvm/lib/MC/MCSectionWasm.cpp | 10 +- .../llvm-project/llvm/lib/MC/MCSectionXCOFF.cpp | 1 + contrib/llvm-project/llvm/lib/MC/MCStreamer.cpp | 63 +- .../llvm-project/llvm/lib/MC/MCSubtargetInfo.cpp | 42 +- contrib/llvm-project/llvm/lib/MC/MCSymbolXCOFF.cpp | 10 +- .../llvm-project/llvm/lib/MC/MCWasmStreamer.cpp | 12 - contrib/llvm-project/llvm/lib/MC/MCWin64EH.cpp | 487 +- .../llvm-project/llvm/lib/MC/MCWinCOFFStreamer.cpp | 14 +- .../llvm/lib/MC/StringTableBuilder.cpp | 15 +- .../llvm-project/llvm/lib/MC/WasmObjectWriter.cpp | 936 ++-- .../llvm/lib/MC/WinCOFFObjectWriter.cpp | 5 +- .../llvm-project/llvm/lib/MC/XCOFFObjectWriter.cpp | 27 +- .../llvm/lib/MCA/HardwareUnits/RegisterFile.cpp | 17 +- .../llvm/lib/MCA/HardwareUnits/Scheduler.cpp | 2 +- contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp | 16 +- .../llvm/lib/MCA/Stages/InstructionTables.cpp | 3 +- contrib/llvm-project/llvm/lib/Object/Archive.cpp | 4 +- .../llvm-project/llvm/lib/Object/ArchiveWriter.cpp | 161 +- contrib/llvm-project/llvm/lib/Object/Binary.cpp | 10 +- 
.../llvm/lib/Object/COFFObjectFile.cpp | 74 +- contrib/llvm-project/llvm/lib/Object/ELF.cpp | 58 +- .../llvm-project/llvm/lib/Object/ELFObjectFile.cpp | 155 +- .../llvm/lib/Object/MachOObjectFile.cpp | 40 +- .../llvm/lib/Object/MachOUniversal.cpp | 29 + .../llvm/lib/Object/MachOUniversalWriter.cpp | 337 ++ .../llvm-project/llvm/lib/Object/ObjectFile.cpp | 5 +- .../llvm/lib/Object/RelocationResolver.cpp | 286 +- .../llvm-project/llvm/lib/Object/SymbolSize.cpp | 2 +- .../llvm-project/llvm/lib/Object/SymbolicFile.cpp | 59 +- .../llvm/lib/Object/WasmObjectFile.cpp | 350 +- .../llvm/lib/Object/XCOFFObjectFile.cpp | 303 +- .../llvm/lib/ObjectYAML/ArchiveEmitter.cpp | 51 + .../llvm/lib/ObjectYAML/ArchiveYAML.cpp | 58 + .../llvm/lib/ObjectYAML/COFFEmitter.cpp | 1 - .../llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 25 +- .../llvm/lib/ObjectYAML/DWARFEmitter.cpp | 1051 +++- .../llvm/lib/ObjectYAML/DWARFVisitor.cpp | 196 - .../llvm/lib/ObjectYAML/DWARFVisitor.h | 97 - .../llvm-project/llvm/lib/ObjectYAML/DWARFYAML.cpp | 168 +- .../llvm/lib/ObjectYAML/ELFEmitter.cpp | 677 +-- .../llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp | 478 +- .../llvm/lib/ObjectYAML/MachOEmitter.cpp | 48 +- .../llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp | 4 +- .../llvm/lib/ObjectYAML/MinidumpYAML.cpp | 4 +- .../llvm/lib/ObjectYAML/ObjectYAML.cpp | 9 +- .../llvm/lib/ObjectYAML/WasmEmitter.cpp | 42 +- .../llvm-project/llvm/lib/ObjectYAML/WasmYAML.cpp | 13 +- .../llvm-project/llvm/lib/ObjectYAML/yaml2obj.cpp | 2 + contrib/llvm-project/llvm/lib/Option/OptTable.cpp | 109 +- contrib/llvm-project/llvm/lib/Option/Option.cpp | 15 +- .../llvm-project/llvm/lib/Passes/PassBuilder.cpp | 847 +++- .../llvm-project/llvm/lib/Passes/PassRegistry.def | 84 +- .../llvm/lib/Passes/StandardInstrumentations.cpp | 743 ++- .../lib/ProfileData/Coverage/CoverageMapping.cpp | 34 +- .../ProfileData/Coverage/CoverageMappingReader.cpp | 53 +- .../ProfileData/Coverage/CoverageMappingWriter.cpp | 16 +- 
contrib/llvm-project/llvm/lib/ProfileData/GCOV.cpp | 940 ++-- .../llvm/lib/ProfileData/InstrProf.cpp | 41 +- .../llvm/lib/ProfileData/InstrProfReader.cpp | 36 +- .../llvm/lib/ProfileData/InstrProfWriter.cpp | 12 +- .../llvm/lib/ProfileData/ProfileSummaryBuilder.cpp | 45 +- .../llvm/lib/ProfileData/SampleProf.cpp | 83 +- .../llvm/lib/ProfileData/SampleProfReader.cpp | 340 +- .../llvm/lib/ProfileData/SampleProfWriter.cpp | 272 +- .../llvm/lib/Remarks/BitstreamRemarkParser.cpp | 1 - .../llvm/lib/Remarks/BitstreamRemarkParser.h | 6 +- .../llvm/lib/Support/AArch64TargetParser.cpp | 16 +- .../llvm/lib/Support/AMDGPUMetadata.cpp | 3 +- .../llvm-project/llvm/lib/Support/APFixedPoint.cpp | 574 +++ contrib/llvm-project/llvm/lib/Support/APFloat.cpp | 108 +- contrib/llvm-project/llvm/lib/Support/APInt.cpp | 17 +- .../llvm/lib/Support/ARMAttributeParser.cpp | 2 +- .../llvm/lib/Support/ARMTargetParser.cpp | 33 +- contrib/llvm-project/llvm/lib/Support/CRC.cpp | 2 +- .../llvm-project/llvm/lib/Support/CachePruning.cpp | 5 +- .../llvm-project/llvm/lib/Support/CommandLine.cpp | 95 +- .../llvm-project/llvm/lib/Support/Compression.cpp | 4 +- .../llvm/lib/Support/ConvertUTFWrapper.cpp | 2 + .../llvm/lib/Support/CrashRecoveryContext.cpp | 39 +- .../llvm-project/llvm/lib/Support/DebugCounter.cpp | 2 +- .../llvm/lib/Support/DynamicLibrary.cpp | 4 +- .../llvm/lib/Support/ELFAttributeParser.cpp | 2 +- contrib/llvm-project/llvm/lib/Support/Error.cpp | 4 + .../llvm/lib/Support/ErrorHandling.cpp | 11 +- .../llvm-project/llvm/lib/Support/FileCheck.cpp | 2580 ---------- .../llvm-project/llvm/lib/Support/FileCheckImpl.h | 832 ---- .../llvm/lib/Support/FileCollector.cpp | 128 +- .../llvm/lib/Support/FormatVariadic.cpp | 23 +- contrib/llvm-project/llvm/lib/Support/Host.cpp | 298 +- contrib/llvm-project/llvm/lib/Support/InitLLVM.cpp | 11 +- .../llvm/lib/Support/InstructionCost.cpp | 24 + contrib/llvm-project/llvm/lib/Support/JSON.cpp | 219 +- .../llvm-project/llvm/lib/Support/KnownBits.cpp | 397 ++ 
.../llvm-project/llvm/lib/Support/LineIterator.cpp | 8 +- .../llvm-project/llvm/lib/Support/LowLevelType.cpp | 2 +- .../llvm/lib/Support/MemoryBufferRef.cpp | 19 + contrib/llvm-project/llvm/lib/Support/Path.cpp | 52 +- .../llvm/lib/Support/PrettyStackTrace.cpp | 13 +- contrib/llvm-project/llvm/lib/Support/Process.cpp | 21 +- contrib/llvm-project/llvm/lib/Support/Program.cpp | 14 +- contrib/llvm-project/llvm/lib/Support/SHA1.cpp | 2 +- contrib/llvm-project/llvm/lib/Support/Signals.cpp | 9 +- .../llvm-project/llvm/lib/Support/Signposts.cpp | 41 +- .../llvm-project/llvm/lib/Support/SmallVector.cpp | 68 +- .../llvm-project/llvm/lib/Support/SourceMgr.cpp | 56 +- .../llvm-project/llvm/lib/Support/TargetParser.cpp | 111 +- contrib/llvm-project/llvm/lib/Support/Timer.cpp | 14 +- .../llvm-project/llvm/lib/Support/TrigramIndex.cpp | 7 +- contrib/llvm-project/llvm/lib/Support/Triple.cpp | 98 +- contrib/llvm-project/llvm/lib/Support/Unicode.cpp | 11 + .../llvm-project/llvm/lib/Support/Unix/Path.inc | 76 +- .../llvm-project/llvm/lib/Support/Unix/Process.inc | 6 +- .../llvm-project/llvm/lib/Support/Unix/Program.inc | 6 +- .../llvm-project/llvm/lib/Support/Unix/Signals.inc | 29 +- .../llvm/lib/Support/VirtualFileSystem.cpp | 158 +- .../llvm-project/llvm/lib/Support/Windows/Path.inc | 61 +- .../llvm/lib/Support/Windows/Process.inc | 3 +- .../llvm/lib/Support/Windows/Program.inc | 59 +- .../llvm/lib/Support/Windows/Signals.inc | 5 +- .../llvm/lib/Support/Windows/Threading.inc | 25 +- .../llvm/lib/Support/X86TargetParser.cpp | 372 +- .../llvm-project/llvm/lib/Support/YAMLParser.cpp | 155 +- .../llvm-project/llvm/lib/Support/YAMLTraits.cpp | 75 +- .../llvm-project/llvm/lib/Support/raw_ostream.cpp | 59 +- .../llvm/lib/TableGen/DetailedRecordsBackend.cpp | 203 + contrib/llvm-project/llvm/lib/TableGen/Error.cpp | 85 +- .../llvm-project/llvm/lib/TableGen/JSONBackend.cpp | 5 +- contrib/llvm-project/llvm/lib/TableGen/Main.cpp | 43 +- contrib/llvm-project/llvm/lib/TableGen/Record.cpp | 519 
+- contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp | 53 +- contrib/llvm-project/llvm/lib/TableGen/TGLexer.h | 28 +- .../llvm-project/llvm/lib/TableGen/TGParser.cpp | 722 ++- contrib/llvm-project/llvm/lib/TableGen/TGParser.h | 5 + .../llvm/lib/TableGen/TableGenBackendSkeleton.cpp | 64 + .../llvm-project/llvm/lib/Target/AArch64/AArch64.h | 8 +- .../llvm/lib/Target/AArch64/AArch64.td | 200 +- .../Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 2 +- .../llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 401 +- .../lib/Target/AArch64/AArch64BranchTargets.cpp | 5 +- .../Target/AArch64/AArch64CallingConvention.cpp | 57 +- .../llvm/lib/Target/AArch64/AArch64Combine.td | 64 +- .../Target/AArch64/AArch64CompressJumpTables.cpp | 41 +- .../Target/AArch64/AArch64ExpandPseudoInsts.cpp | 44 + .../lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 10 +- .../llvm/lib/Target/AArch64/AArch64FastISel.cpp | 18 +- .../lib/Target/AArch64/AArch64FrameLowering.cpp | 575 ++- .../llvm/lib/Target/AArch64/AArch64FrameLowering.h | 30 +- .../lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 87 +- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 3183 +++++++++--- .../llvm/lib/Target/AArch64/AArch64ISelLowering.h | 137 +- .../llvm/lib/Target/AArch64/AArch64InstrFormats.td | 130 +- .../llvm/lib/Target/AArch64/AArch64InstrGISel.td | 60 + .../llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 527 +- .../llvm/lib/Target/AArch64/AArch64InstrInfo.h | 52 +- .../llvm/lib/Target/AArch64/AArch64InstrInfo.td | 334 +- .../Target/AArch64/AArch64LoadStoreOptimizer.cpp | 43 +- .../llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 6 + .../Target/AArch64/AArch64MachineFunctionInfo.cpp | 82 + .../Target/AArch64/AArch64MachineFunctionInfo.h | 74 +- .../llvm/lib/Target/AArch64/AArch64MacroFusion.cpp | 16 +- .../AArch64/AArch64RedundantCopyElimination.cpp | 5 + .../lib/Target/AArch64/AArch64RegisterInfo.cpp | 111 +- .../llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 18 +- .../llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 26 + 
.../lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 10 +- .../llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 684 ++- .../llvm/lib/Target/AArch64/AArch64SchedA55.td | 339 ++ .../llvm/lib/Target/AArch64/AArch64SchedA57.td | 61 +- .../lib/Target/AArch64/AArch64SchedA57WriteRes.td | 19 + .../llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 3890 +++++++++++++++ .../llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 745 +++ .../lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 6 +- .../llvm/lib/Target/AArch64/AArch64StackOffset.h | 151 - .../lib/Target/AArch64/AArch64StackTagging.cpp | 22 +- .../Target/AArch64/AArch64StackTaggingPreRA.cpp | 167 +- .../llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 22 +- .../llvm/lib/Target/AArch64/AArch64Subtarget.h | 52 +- .../lib/Target/AArch64/AArch64SystemOperands.td | 253 +- .../lib/Target/AArch64/AArch64TargetMachine.cpp | 57 +- .../llvm/lib/Target/AArch64/AArch64TargetMachine.h | 6 + .../Target/AArch64/AArch64TargetTransformInfo.cpp | 196 +- .../Target/AArch64/AArch64TargetTransformInfo.h | 52 +- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 596 ++- .../AArch64/Disassembler/AArch64Disassembler.cpp | 45 +- .../Target/AArch64/GISel/AArch64CallLowering.cpp | 106 +- .../lib/Target/AArch64/GISel/AArch64CallLowering.h | 5 +- .../Target/AArch64/GISel/AArch64GlobalISelUtils.h | 29 + .../AArch64/GISel/AArch64InstructionSelector.cpp | 1677 ++++--- .../Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 336 +- .../Target/AArch64/GISel/AArch64LegalizerInfo.h | 2 + .../AArch64/GISel/AArch64PostLegalizerCombiner.cpp | 523 +- .../AArch64/GISel/AArch64PostLegalizerLowering.cpp | 704 +++ .../AArch64/GISel/AArch64PostSelectOptimize.cpp | 187 + .../AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 29 +- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 92 +- .../Target/AArch64/GISel/AArch64RegisterBankInfo.h | 15 +- .../llvm/lib/Target/AArch64/GISel/select-saddo.mir | 158 + .../llvm/lib/Target/AArch64/GISel/select-ssubo.mir | 158 + 
.../AArch64/MCTargetDesc/AArch64AddressingModes.h | 6 +- .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 27 +- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 6 +- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 55 + .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 34 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.h | 8 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4 +- .../Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 1 + .../Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 2 + .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 8 +- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 6 +- .../AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 57 +- .../AArch64/MCTargetDesc/AArch64TargetStreamer.h | 18 + .../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 37 +- .../llvm/lib/Target/AArch64/SVEInstrFormats.td | 557 ++- .../llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 58 +- .../lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 9 +- .../lib/Target/AArch64/Utils/AArch64BaseInfo.h | 20 +- .../llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h | 99 +- .../llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td | 197 +- .../llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 58 +- .../llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 22 +- .../lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 31 +- .../Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 21 +- .../Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 16 +- .../lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 4 +- .../lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 7 +- .../llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 162 +- .../llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 24 +- .../lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 18 +- .../llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 856 +++- .../llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 41 +- .../llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 76 +- .../lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 39 +- .../llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 32 +- 
.../lib/Target/AMDGPU/AMDGPUExportClustering.cpp | 10 +- .../lib/Target/AMDGPU/AMDGPUExportClustering.h | 3 +- .../llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 39 +- .../Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 12 +- .../lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 12 +- .../llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 38 +- .../Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 122 +- .../lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 23 +- .../llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 601 ++- .../llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 237 +- .../llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 24 +- .../llvm/lib/Target/AMDGPU/AMDGPUInline.cpp | 226 - .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 1075 ++++ .../llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 10 +- .../llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 25 +- .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1101 ++-- .../lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 62 +- .../llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 159 +- .../lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 195 + .../llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1259 +++-- .../llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 48 +- .../llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 93 +- .../llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 5 +- .../lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 10 +- .../Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 53 +- .../Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 38 +- .../llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 7 +- .../llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp | 38 + .../llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h | 47 + .../Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 62 +- .../lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 36 +- .../llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 24 +- .../lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 1 - .../lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 4 - .../llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 4 +- 
.../llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h | 3 +- .../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 5 - .../llvm/lib/Target/AMDGPU/AMDGPUPTNote.h | 7 +- .../lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 4 - .../lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 1 - .../Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 120 +- .../Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 8 +- .../Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 154 +- .../llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 281 +- .../Target/AMDGPU/AMDGPUPropagateAttributes.cpp | 49 +- .../lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 10 +- .../lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 372 +- .../lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 8 +- .../llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2 +- .../Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 33 +- .../lib/Target/AMDGPU/AMDGPUSearchableTables.td | 2 + .../llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 172 +- .../llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 1204 +---- .../llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 252 +- .../llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 23 +- .../lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 9 +- .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 365 +- .../lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 106 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 63 +- .../llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 66 +- .../lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 29 +- .../llvm/lib/Target/AMDGPU/AMDKernelCodeT.h | 8 +- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1997 +++++--- .../llvm/lib/Target/AMDGPU/BUFInstructions.td | 237 +- .../llvm/lib/Target/AMDGPU/DSInstructions.td | 119 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 435 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 39 +- .../llvm/lib/Target/AMDGPU/EXPInstructions.td | 125 + .../lib/Target/AMDGPU/EvergreenInstructions.td | 138 +- .../llvm/lib/Target/AMDGPU/FLATInstructions.td | 704 ++- 
.../llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 22 +- .../llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 220 +- .../llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 3 +- .../llvm/lib/Target/AMDGPU/GCNILPSched.cpp | 1 - .../lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 21 - .../llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h | 6 - .../llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 13 - .../llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 39 +- .../llvm/lib/Target/AMDGPU/GCNProcessors.td | 44 +- .../llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 256 +- .../llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 38 +- .../llvm/lib/Target/AMDGPU/GCNRegPressure.h | 12 +- .../llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 +- .../llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1064 ++++ .../llvm/lib/Target/AMDGPU/InstCombineTables.td | 11 + .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 9 +- .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 6 - .../AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 3 +- .../Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 6 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 217 +- .../Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 12 +- .../Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 1 - .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 12 +- .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 14 +- .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 81 +- .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 18 +- .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 9 +- .../Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 16 +- .../llvm/lib/Target/AMDGPU/MIMGInstructions.td | 129 +- .../llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp | 4 +- .../llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp | 11 +- .../lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 36 +- .../llvm/lib/Target/AMDGPU/R600Defines.h | 2 - .../lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 20 +- .../lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 16 +- 
.../llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 13 +- .../llvm/lib/Target/AMDGPU/R600FrameLowering.h | 4 +- .../llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 59 +- .../llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 44 +- .../llvm/lib/Target/AMDGPU/R600InstrInfo.h | 4 +- .../llvm/lib/Target/AMDGPU/R600Instructions.td | 8 +- .../lib/Target/AMDGPU/R600MachineScheduler.cpp | 15 +- .../llvm/lib/Target/AMDGPU/R600MachineScheduler.h | 2 +- .../AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 15 - .../Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 23 +- .../llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 7 +- .../llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp | 10 +- .../llvm/lib/Target/AMDGPU/R600RegisterInfo.h | 2 +- .../llvm/lib/Target/AMDGPU/R600Subtarget.h | 174 + .../llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp | 13 +- .../lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 37 +- .../llvm/lib/Target/AMDGPU/SIDefines.h | 257 +- .../llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 144 +- .../llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 3 +- .../llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp | 239 - .../llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 283 +- .../llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 55 +- .../llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 406 +- .../llvm/lib/Target/AMDGPU/SIFrameLowering.h | 6 +- .../llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2013 ++++---- .../llvm/lib/Target/AMDGPU/SIISelLowering.h | 56 +- .../llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 5 +- .../llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 163 +- .../llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 241 +- .../llvm/lib/Target/AMDGPU/SIInstrFormats.td | 52 +- .../llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1551 +++--- .../llvm/lib/Target/AMDGPU/SIInstrInfo.h | 104 +- .../llvm/lib/Target/AMDGPU/SIInstrInfo.td | 552 +-- .../llvm/lib/Target/AMDGPU/SIInstructions.td | 570 ++- .../lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 52 +- .../llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 316 +- 
.../llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 38 +- .../llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 109 +- .../lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 72 +- .../llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 65 +- .../llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 42 +- .../llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 10 +- .../llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 586 ++- .../llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 24 +- .../lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 59 +- .../Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 208 +- .../llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 76 +- .../llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 6 +- .../lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 17 +- .../llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 20 +- .../llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 56 + .../llvm/lib/Target/AMDGPU/SIProgramInfo.h | 12 +- .../llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 744 ++- .../llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 42 +- .../llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 52 +- .../Target/AMDGPU/SIRemoveShortExecBranches.cpp | 3 +- .../llvm/lib/Target/AMDGPU/SISchedule.td | 15 +- .../lib/Target/AMDGPU/SIShrinkInstructions.cpp | 170 +- .../llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 204 +- .../llvm/lib/Target/AMDGPU/SMInstructions.td | 4 +- .../llvm/lib/Target/AMDGPU/SOPInstructions.td | 629 ++- .../lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 247 + .../llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 15 + .../lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 460 +- .../llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 188 +- .../lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 50 +- .../lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 18 +- .../Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 4 +- .../lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 2 +- .../llvm/lib/Target/AMDGPU/VOP1Instructions.td | 29 +- .../llvm/lib/Target/AMDGPU/VOP2Instructions.td | 225 +- .../llvm/lib/Target/AMDGPU/VOP3Instructions.td | 592 +-- 
.../llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 269 +- .../llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2 +- .../llvm/lib/Target/AMDGPU/VOPInstructions.td | 38 +- .../llvm/lib/Target/ARC/ARCISelLowering.cpp | 1 - .../llvm/lib/Target/ARC/ARCSubtarget.cpp | 2 +- .../llvm/lib/Target/ARC/ARCSubtarget.h | 2 +- .../llvm/lib/Target/ARC/ARCTargetMachine.cpp | 4 +- .../lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h | 1 + .../Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp | 2 +- .../llvm/lib/Target/ARM/A15SDOptimizer.cpp | 3 +- contrib/llvm-project/llvm/lib/Target/ARM/ARM.h | 5 + contrib/llvm-project/llvm/lib/Target/ARM/ARM.td | 109 +- .../llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 48 +- .../llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 721 ++- .../llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 221 +- .../llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 21 +- .../llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 10 +- .../llvm/lib/Target/ARM/ARMBlockPlacement.cpp | 228 + .../llvm/lib/Target/ARM/ARMCallLowering.cpp | 68 +- .../llvm/lib/Target/ARM/ARMCallLowering.h | 6 +- .../llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 79 +- .../llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 92 +- .../llvm/lib/Target/ARM/ARMFastISel.cpp | 18 +- .../llvm-project/llvm/lib/Target/ARM/ARMFeatures.h | 1 + .../llvm/lib/Target/ARM/ARMFrameLowering.cpp | 13 +- .../llvm/lib/Target/ARM/ARMFrameLowering.h | 5 +- .../llvm/lib/Target/ARM/ARMHazardRecognizer.cpp | 190 +- .../llvm/lib/Target/ARM/ARMHazardRecognizer.h | 46 +- .../llvm/lib/Target/ARM/ARMISelLowering.cpp | 737 ++- .../llvm/lib/Target/ARM/ARMISelLowering.h | 46 +- .../llvm/lib/Target/ARM/ARMInstrFormats.td | 5 +- .../llvm/lib/Target/ARM/ARMInstrInfo.td | 92 +- .../llvm/lib/Target/ARM/ARMInstrMVE.td | 801 +-- .../llvm/lib/Target/ARM/ARMInstrNEON.td | 147 +- .../llvm/lib/Target/ARM/ARMInstrThumb.td | 13 +- .../llvm/lib/Target/ARM/ARMInstrThumb2.td | 51 +- .../llvm/lib/Target/ARM/ARMInstrVFP.td | 51 +- .../llvm/lib/Target/ARM/ARMInstructionSelector.cpp | 2 - 
.../llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 2 +- .../llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 449 +- .../llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 1525 +++--- .../llvm/lib/Target/ARM/ARMParallelDSP.cpp | 6 +- .../llvm/lib/Target/ARM/ARMPredicates.td | 5 + .../llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp | 12 +- .../llvm/lib/Target/ARM/ARMRegisterInfo.td | 17 + .../llvm/lib/Target/ARM/ARMSLSHardening.cpp | 416 ++ .../llvm/lib/Target/ARM/ARMSchedule.td | 66 +- .../llvm/lib/Target/ARM/ARMScheduleA57.td | 147 +- .../llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td | 11 +- .../llvm/lib/Target/ARM/ARMScheduleA9.td | 4 +- .../llvm/lib/Target/ARM/ARMScheduleM7.td | 488 ++ .../llvm/lib/Target/ARM/ARMScheduleR52.td | 4 +- .../llvm/lib/Target/ARM/ARMScheduleSwift.td | 4 +- .../llvm/lib/Target/ARM/ARMSubtarget.cpp | 14 +- .../llvm/lib/Target/ARM/ARMSubtarget.h | 19 +- .../llvm/lib/Target/ARM/ARMTargetMachine.cpp | 28 +- .../llvm/lib/Target/ARM/ARMTargetMachine.h | 6 + .../llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 858 +++- .../llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 58 +- .../llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 28 +- .../Target/ARM/Disassembler/ARMDisassembler.cpp | 7 +- .../Target/ARM/MCTargetDesc/ARMAddressingModes.h | 26 + .../lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 1 + .../llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 2 +- .../lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 2 - .../lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 1 + .../lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 1 + .../Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 132 +- .../lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 15 + .../lib/Target/ARM/MVEGatherScatterLowering.cpp | 256 +- .../llvm/lib/Target/ARM/MVETailPredUtils.h | 157 + .../llvm/lib/Target/ARM/MVETailPredication.cpp | 505 +- .../llvm/lib/Target/ARM/MVEVPTBlockPass.cpp | 36 +- .../lib/Target/ARM/MVEVPTOptimisationsPass.cpp | 460 +- .../llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 47 + 
.../llvm/lib/Target/ARM/Thumb2InstrInfo.h | 8 + .../llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 2 +- .../llvm-project/llvm/lib/Target/AVR/AVRDevices.td | 21 +- .../llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp | 147 + .../llvm/lib/Target/AVR/AVRFrameLowering.cpp | 33 +- .../llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp | 5 +- .../llvm/lib/Target/AVR/AVRISelLowering.cpp | 129 +- .../llvm/lib/Target/AVR/AVRISelLowering.h | 7 + .../llvm/lib/Target/AVR/AVRInstrInfo.td | 35 +- .../llvm/lib/Target/AVR/AVRSubtarget.cpp | 6 +- .../llvm/lib/Target/AVR/AVRSubtarget.h | 2 +- .../llvm/lib/Target/AVR/AVRTargetMachine.cpp | 2 +- .../llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 5 +- .../lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 2 +- .../lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h | 6 +- .../lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h | 1 + .../llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 14 +- .../Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp | 2 +- contrib/llvm-project/llvm/lib/Target/BPF/BPF.h | 37 +- .../lib/Target/BPF/BPFAbstractMemberAccess.cpp | 323 +- .../llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 323 ++ contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h | 30 + .../llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp | 130 + .../llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp | 2 - .../llvm/lib/Target/BPF/BPFISelLowering.cpp | 1 - .../llvm/lib/Target/BPF/BPFInstrFormats.td | 11 +- .../llvm/lib/Target/BPF/BPFInstrInfo.td | 190 +- .../llvm/lib/Target/BPF/BPFMIChecking.cpp | 78 +- .../llvm/lib/Target/BPF/BPFMIPeephole.cpp | 3 + .../llvm/lib/Target/BPF/BPFPreserveDIType.cpp | 131 +- .../llvm/lib/Target/BPF/BPFSubtarget.cpp | 4 +- .../llvm/lib/Target/BPF/BPFSubtarget.h | 2 +- .../llvm/lib/Target/BPF/BPFTargetMachine.cpp | 63 +- .../llvm/lib/Target/BPF/BPFTargetMachine.h | 6 + .../llvm/lib/Target/BPF/BPFTargetTransformInfo.h | 61 + contrib/llvm-project/llvm/lib/Target/BPF/BTF.def | 1 + .../llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp | 106 +- .../llvm-project/llvm/lib/Target/BPF/BTFDebug.h 
| 21 +- .../Target/BPF/Disassembler/BPFDisassembler.cpp | 4 +- .../lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 5 - .../lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h | 1 + .../Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 11 +- .../Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 2 +- contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td | 32 + .../llvm/lib/Target/CSKY/CSKYInstrFormats.td | 528 ++ .../llvm/lib/Target/CSKY/CSKYInstrInfo.td | 108 + .../llvm/lib/Target/CSKY/CSKYRegisterInfo.td | 182 + .../llvm/lib/Target/CSKY/CSKYTargetMachine.cpp | 68 + .../llvm/lib/Target/CSKY/CSKYTargetMachine.h | 38 + .../Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp | 69 + .../lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h | 39 + .../CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 45 + .../lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp | 25 + .../lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h | 29 + .../Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp | 71 + .../Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h | 61 + .../Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp | 62 + .../Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h | 48 + .../lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp | 20 + .../lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h | 20 + .../Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 6 +- .../llvm/lib/Target/Hexagon/BitTracker.cpp | 27 +- .../llvm/lib/Target/Hexagon/BitTracker.h | 17 +- .../Hexagon/Disassembler/HexagonDisassembler.cpp | 6 +- .../llvm-project/llvm/lib/Target/Hexagon/Hexagon.h | 5 +- .../llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 76 +- .../llvm/lib/Target/Hexagon/HexagonBitTracker.cpp | 8 +- .../llvm/lib/Target/Hexagon/HexagonBitTracker.h | 4 +- .../llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp | 6 +- .../llvm/lib/Target/Hexagon/HexagonBlockRanges.h | 5 +- .../lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 1 + .../llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp | 16 +- .../lib/Target/Hexagon/HexagonConstExtenders.cpp | 12 +- .../lib/Target/Hexagon/HexagonConstPropagation.cpp | 
41 +- .../lib/Target/Hexagon/HexagonCopyToCombine.cpp | 2 + .../llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 6 +- .../lib/Target/Hexagon/HexagonExpandCondsets.cpp | 91 +- .../lib/Target/Hexagon/HexagonFrameLowering.cpp | 13 +- .../llvm/lib/Target/Hexagon/HexagonFrameLowering.h | 5 +- .../llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 16 +- .../lib/Target/Hexagon/HexagonGenPredicate.cpp | 17 +- .../lib/Target/Hexagon/HexagonHardwareLoops.cpp | 6 +- .../lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 4 +- .../lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 229 +- .../lib/Target/Hexagon/HexagonISelLowering.cpp | 162 +- .../llvm/lib/Target/Hexagon/HexagonISelLowering.h | 909 ++-- .../lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 738 ++- .../llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 7 +- .../llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 4 +- .../lib/Target/Hexagon/HexagonIntrinsicsV60.td | 2 +- .../Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 265 +- .../Target/Hexagon/HexagonLoopIdiomRecognition.h | 24 + .../llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp | 2 +- .../llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp | 2 +- .../Target/Hexagon/HexagonOptimizeSZextends.cpp | 4 +- .../llvm/lib/Target/Hexagon/HexagonPatterns.td | 119 +- .../llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 240 +- .../llvm/lib/Target/Hexagon/HexagonPeephole.cpp | 11 +- .../lib/Target/Hexagon/HexagonRegisterInfo.cpp | 2 +- .../llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp | 23 +- .../llvm/lib/Target/Hexagon/HexagonSubtarget.cpp | 89 +- .../llvm/lib/Target/Hexagon/HexagonSubtarget.h | 30 +- .../lib/Target/Hexagon/HexagonTargetMachine.cpp | 71 +- .../llvm/lib/Target/Hexagon/HexagonTargetMachine.h | 2 + .../lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 1 + .../Target/Hexagon/HexagonTargetTransformInfo.cpp | 47 +- .../Target/Hexagon/HexagonTargetTransformInfo.h | 11 +- .../lib/Target/Hexagon/HexagonVectorCombine.cpp | 1487 ++++++ .../Hexagon/HexagonVectorLoopCarriedReuse.cpp | 165 +- 
.../Target/Hexagon/HexagonVectorLoopCarriedReuse.h | 139 + .../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 4 +- .../Hexagon/MCTargetDesc/HexagonInstPrinter.h | 1 + .../Hexagon/MCTargetDesc/HexagonMCCompound.cpp | 19 +- .../Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp | 2 +- .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 3 +- .../Hexagon/MCTargetDesc/HexagonShuffler.cpp | 5 +- .../llvm/lib/Target/Hexagon/RDFDeadCode.cpp | 3 +- .../llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 5 +- .../llvm/lib/Target/Lanai/LanaiSubtarget.cpp | 4 +- .../llvm/lib/Target/Lanai/LanaiSubtarget.h | 3 +- .../llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 4 +- .../llvm/lib/Target/Lanai/LanaiTargetMachine.h | 2 - .../lib/Target/Lanai/LanaiTargetTransformInfo.h | 3 +- .../Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 5 - .../Target/Lanai/MCTargetDesc/LanaiInstPrinter.h | 1 + .../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 2 +- .../Target/MSP430/AsmParser/MSP430AsmParser.cpp | 1 - .../MSP430/MCTargetDesc/MSP430AsmBackend.cpp | 5 - .../Target/MSP430/MCTargetDesc/MSP430InstPrinter.h | 1 + .../MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 2 +- .../llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 12 +- .../llvm/lib/Target/MSP430/MSP430Subtarget.cpp | 4 +- .../llvm/lib/Target/MSP430/MSP430Subtarget.h | 2 +- .../llvm/lib/Target/MSP430/MSP430TargetMachine.h | 2 +- .../lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 8 +- .../lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp | 1 - .../lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 9 - .../lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h | 1 + .../Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 2 +- contrib/llvm-project/llvm/lib/Target/Mips/Mips.td | 2 +- .../llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 2 +- .../llvm/lib/Target/Mips/MipsCallLowering.cpp | 82 +- .../llvm/lib/Target/Mips/MipsCallLowering.h | 7 +- .../lib/Target/Mips/MipsConstantIslandPass.cpp | 9 +- .../llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp | 19 +- 
.../llvm/lib/Target/Mips/MipsExpandPseudo.cpp | 4 +- .../llvm/lib/Target/Mips/MipsISelLowering.cpp | 21 +- .../llvm/lib/Target/Mips/MipsISelLowering.h | 14 - .../llvm/lib/Target/Mips/MipsInstrFPU.td | 9 + .../llvm/lib/Target/Mips/MipsInstrInfo.cpp | 2 +- .../llvm/lib/Target/Mips/MipsInstrInfo.td | 2 +- .../llvm/lib/Target/Mips/MipsLegalizerInfo.cpp | 11 +- .../llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp | 13 +- .../llvm/lib/Target/Mips/MipsRegisterBankInfo.h | 2 +- .../llvm/lib/Target/Mips/MipsSEFrameLowering.cpp | 11 +- .../llvm/lib/Target/Mips/MipsSEFrameLowering.h | 5 +- .../llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 4 +- .../llvm/lib/Target/Mips/MipsSchedule.td | 3 + .../llvm/lib/Target/Mips/MipsScheduleGeneric.td | 5 +- .../llvm/lib/Target/Mips/MipsScheduleP5600.td | 4 +- .../llvm/lib/Target/Mips/MipsSubtarget.cpp | 32 +- .../llvm/lib/Target/Mips/MipsSubtarget.h | 2 +- .../llvm/lib/Target/Mips/MipsTargetMachine.cpp | 29 +- .../llvm/lib/Target/Mips/MipsTargetMachine.h | 8 + .../Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 1 + .../Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 1 + .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 2 +- contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h | 19 + .../llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 6 +- .../llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp | 9 +- .../llvm/lib/Target/NVPTX/NVPTXFrameLowering.h | 5 +- .../llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 9 +- .../llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 37 +- .../llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 +- .../llvm/lib/Target/NVPTX/NVPTXInstrFormats.td | 32 +- .../llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 36 +- .../llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 68 +- .../llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 9 +- .../lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 3 +- .../llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 4 +- .../llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 8 +- .../llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +- 
.../llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 31 +- .../llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 4 +- .../lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 257 + .../lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3 + .../llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 40 +- .../llvm/lib/Target/NVPTX/NVVMReflect.cpp | 15 +- .../lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 257 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 38 +- .../lib/Target/PowerPC/GISel/PPCCallLowering.cpp | 53 + .../lib/Target/PowerPC/GISel/PPCCallLowering.h | 40 + .../PowerPC/GISel/PPCInstructionSelector.cpp | 92 + .../lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp | 20 + .../lib/Target/PowerPC/GISel/PPCLegalizerInfo.h | 28 + .../Target/PowerPC/GISel/PPCRegisterBankInfo.cpp | 27 + .../lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h | 39 + .../lib/Target/PowerPC/GISel/PPCRegisterBanks.td | 15 + .../Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 10 +- .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 24 + .../Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp | 118 + .../Target/PowerPC/MCTargetDesc/PPCELFStreamer.h | 7 + .../Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 3 + .../Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 163 +- .../Target/PowerPC/MCTargetDesc/PPCInstPrinter.h | 86 +- .../Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 5 +- .../lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 1 - .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 63 +- .../Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h | 12 +- .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 26 +- .../Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 13 +- .../PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 11 +- .../llvm/lib/Target/PowerPC/P9InstrResources.td | 2 +- contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h | 63 +- .../llvm-project/llvm/lib/Target/PowerPC/PPC.td | 56 +- .../llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 674 ++- .../llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp | 11 +- .../llvm/lib/Target/PowerPC/PPCCCState.cpp 
| 2 +- .../llvm/lib/Target/PowerPC/PPCCTRLoops.cpp | 92 +- .../llvm/lib/Target/PowerPC/PPCCallingConv.td | 28 +- .../llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp | 26 +- .../llvm/lib/Target/PowerPC/PPCFastISel.cpp | 19 +- .../llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 586 +-- .../llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 841 ++-- .../llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 3810 ++++++-------- .../llvm/lib/Target/PowerPC/PPCISelLowering.h | 130 +- .../llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 99 +- .../llvm/lib/Target/PowerPC/PPCInstrAltivec.td | 170 +- .../llvm/lib/Target/PowerPC/PPCInstrFormats.td | 55 +- .../llvm/lib/Target/PowerPC/PPCInstrHTM.td | 5 +- .../llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 1019 +++- .../llvm/lib/Target/PowerPC/PPCInstrInfo.h | 106 +- .../llvm/lib/Target/PowerPC/PPCInstrInfo.td | 441 +- .../llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 1735 ++++++- .../llvm/lib/Target/PowerPC/PPCInstrQPX.td | 1212 ----- .../llvm/lib/Target/PowerPC/PPCInstrSPE.td | 10 - .../llvm/lib/Target/PowerPC/PPCInstrVSX.td | 375 +- .../lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 40 +- .../lib/Target/PowerPC/PPCLowerMASSVEntries.cpp | 6 +- .../llvm/lib/Target/PowerPC/PPCMCInstLower.cpp | 14 +- .../llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 278 +- .../lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 34 + .../lib/Target/PowerPC/PPCMachineFunctionInfo.h | 37 +- .../lib/Target/PowerPC/PPCMachineScheduler.cpp | 140 +- .../llvm/lib/Target/PowerPC/PPCMacroFusion.cpp | 4 +- .../llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 240 +- .../llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 161 - .../lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 4 +- .../llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 170 +- .../llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 27 +- .../llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 113 +- .../llvm/lib/Target/PowerPC/PPCScheduleP9.td | 11 +- .../llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 56 +- .../llvm/lib/Target/PowerPC/PPCSubtarget.h | 44 +- 
.../llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 49 +- .../llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 111 +- .../llvm/lib/Target/PowerPC/PPCTargetMachine.h | 5 + .../lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 326 +- .../lib/Target/PowerPC/PPCTargetTransformInfo.h | 14 +- .../llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 6 +- .../llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 30 +- .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 9 +- .../Target/PowerPC/TargetInfo/PowerPCTargetInfo.h | 1 + .../lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 657 ++- .../RISCV/Disassembler/RISCVDisassembler.cpp | 30 +- .../Target/RISCV/MCTargetDesc/RISCVAsmBackend.h | 2 +- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 142 + .../lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 406 ++ .../Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp | 4 +- .../Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 39 +- .../Target/RISCV/MCTargetDesc/RISCVInstPrinter.h | 5 +- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 15 +- .../lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 1 - .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 7 +- .../lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 91 + .../lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h | 43 + .../RISCV/MCTargetDesc/RISCVTargetStreamer.cpp | 34 +- contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h | 5 +- .../llvm-project/llvm/lib/Target/RISCV/RISCV.td | 92 +- .../llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 4 +- .../llvm/lib/Target/RISCV/RISCVCallLowering.cpp | 11 +- .../llvm/lib/Target/RISCV/RISCVCallLowering.h | 6 +- .../llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp | 154 + .../lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 65 + .../llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 145 +- .../llvm/lib/Target/RISCV/RISCVFrameLowering.h | 5 +- .../llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 1095 ++-- .../llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 27 +- .../llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1969 +++++++- 
.../llvm/lib/Target/RISCV/RISCVISelLowering.h | 125 +- .../llvm/lib/Target/RISCV/RISCVInstrFormats.td | 85 +- .../llvm/lib/Target/RISCV/RISCVInstrFormatsV.td | 141 +- .../llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 117 +- .../llvm/lib/Target/RISCV/RISCVInstrInfo.h | 21 +- .../llvm/lib/Target/RISCV/RISCVInstrInfo.td | 141 +- .../llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 1029 ++-- .../llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 22 +- .../llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 22 +- .../llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 30 +- .../llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 6 +- .../llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 695 ++- .../lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 4416 +++++++++++++++++ .../lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 643 +++ .../llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 371 ++ .../llvm/lib/Target/RISCV/RISCVMCInstLower.cpp | 98 + .../llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 4 +- .../llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 23 +- .../llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 389 +- .../llvm/lib/Target/RISCV/RISCVSchedRocket.td | 233 + .../llvm/lib/Target/RISCV/RISCVSchedRocket32.td | 227 - .../llvm/lib/Target/RISCV/RISCVSchedRocket64.td | 228 - .../llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 222 + .../llvm/lib/Target/RISCV/RISCVSchedule.td | 2 +- .../llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 14 +- .../llvm/lib/Target/RISCV/RISCVSubtarget.h | 20 +- .../llvm/lib/Target/RISCV/RISCVSystemOperands.td | 2 + .../llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 49 +- .../llvm/lib/Target/RISCV/RISCVTargetMachine.h | 5 +- .../lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 8 +- .../lib/Target/RISCV/RISCVTargetTransformInfo.h | 5 +- .../llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp | 81 - .../llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h | 223 - .../llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp | 93 - .../llvm/lib/Target/RISCV/Utils/RISCVMatInt.h | 44 - .../lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 158 +- 
.../llvm/lib/Target/Sparc/LeonPasses.cpp | 7 +- .../llvm/lib/Target/Sparc/LeonPasses.h | 9 +- .../Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 6 - .../Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp | 30 +- .../Target/Sparc/MCTargetDesc/SparcInstPrinter.h | 1 + .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 52 +- .../lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 77 +- .../lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h | 1 + .../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 2 +- .../llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 2 +- .../llvm/lib/Target/Sparc/SparcFrameLowering.cpp | 10 +- .../llvm/lib/Target/Sparc/SparcFrameLowering.h | 5 +- .../llvm/lib/Target/Sparc/SparcISelLowering.cpp | 56 +- .../llvm/lib/Target/Sparc/SparcInstr64Bit.td | 6 +- .../llvm/lib/Target/Sparc/SparcInstrFormats.td | 4 +- .../llvm/lib/Target/Sparc/SparcInstrInfo.td | 25 +- .../llvm/lib/Target/Sparc/SparcRegisterInfo.cpp | 2 +- .../llvm/lib/Target/Sparc/SparcSubtarget.cpp | 8 +- .../llvm/lib/Target/Sparc/SparcSubtarget.h | 4 +- .../llvm/lib/Target/Sparc/SparcTargetMachine.cpp | 14 +- .../Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 36 +- .../SystemZ/Disassembler/SystemZDisassembler.cpp | 4 +- .../SystemZ/MCTargetDesc/SystemZInstPrinter.h | 1 + .../SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 4 - .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 2 + .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 2 +- .../llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 21 +- .../llvm/lib/Target/SystemZ/SystemZElimCompare.cpp | 16 +- .../llvm/lib/Target/SystemZ/SystemZFeatures.td | 2 +- .../lib/Target/SystemZ/SystemZFrameLowering.cpp | 88 +- .../llvm/lib/Target/SystemZ/SystemZFrameLowering.h | 12 +- .../lib/Target/SystemZ/SystemZHazardRecognizer.cpp | 6 +- .../lib/Target/SystemZ/SystemZHazardRecognizer.h | 2 +- .../lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 44 + .../lib/Target/SystemZ/SystemZISelLowering.cpp | 101 +- .../llvm/lib/Target/SystemZ/SystemZISelLowering.h | 5 +- 
.../llvm/lib/Target/SystemZ/SystemZInstrFormats.td | 54 + .../llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 12 +- .../llvm/lib/Target/SystemZ/SystemZInstrInfo.td | 104 +- .../llvm/lib/Target/SystemZ/SystemZInstrVector.td | 8 +- .../lib/Target/SystemZ/SystemZMachineScheduler.cpp | 1 + .../llvm/lib/Target/SystemZ/SystemZOperators.td | 35 +- .../lib/Target/SystemZ/SystemZRegisterInfo.cpp | 14 +- .../llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 2 +- .../llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 2 +- .../llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 2 +- .../llvm/lib/Target/SystemZ/SystemZScheduleZ196.td | 2 +- .../lib/Target/SystemZ/SystemZScheduleZEC12.td | 2 +- .../lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 5 +- .../llvm/lib/Target/SystemZ/SystemZSubtarget.cpp | 22 +- .../llvm/lib/Target/SystemZ/SystemZSubtarget.h | 2 +- .../lib/Target/SystemZ/SystemZTargetMachine.cpp | 10 +- .../Target/SystemZ/SystemZTargetTransformInfo.cpp | 33 +- .../Target/SystemZ/SystemZTargetTransformInfo.h | 6 +- .../llvm/lib/Target/TargetLoweringObjectFile.cpp | 58 + .../llvm-project/llvm/lib/Target/TargetMachine.cpp | 85 +- .../llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp | 103 +- .../lib/Target/VE/Disassembler/VEDisassembler.cpp | 55 +- contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp | 137 + .../lib/Target/VE/MCTargetDesc/VEInstPrinter.h | 1 + .../lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp | 6 +- .../llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp | 2 +- contrib/llvm-project/llvm/lib/Target/VE/VE.h | 23 +- contrib/llvm-project/llvm/lib/Target/VE/VE.td | 5 +- .../llvm/lib/Target/VE/VEAsmPrinter.cpp | 48 +- .../llvm/lib/Target/VE/VECallingConv.td | 138 +- .../llvm/lib/Target/VE/VEFrameLowering.cpp | 402 +- .../llvm/lib/Target/VE/VEFrameLowering.h | 9 +- .../llvm/lib/Target/VE/VEISelDAGToDAG.cpp | 40 +- .../llvm/lib/Target/VE/VEISelLowering.cpp | 2238 ++++++++- .../llvm/lib/Target/VE/VEISelLowering.h | 150 +- .../llvm/lib/Target/VE/VEInstrBuilder.h | 41 + 
.../llvm/lib/Target/VE/VEInstrFormats.td | 89 + .../llvm/lib/Target/VE/VEInstrInfo.cpp | 534 +- .../llvm-project/llvm/lib/Target/VE/VEInstrInfo.h | 32 + .../llvm-project/llvm/lib/Target/VE/VEInstrInfo.td | 845 +++- .../llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td | 1604 ++++++ .../llvm/lib/Target/VE/VEInstrIntrinsicVL.td | 64 + .../llvm/lib/Target/VE/VEInstrPatternsVec.td | 91 + .../llvm-project/llvm/lib/Target/VE/VEInstrVec.td | 1510 ++++++ .../llvm/lib/Target/VE/VEMCInstLower.cpp | 8 +- .../llvm/lib/Target/VE/VERegisterInfo.cpp | 105 +- .../llvm/lib/Target/VE/VERegisterInfo.h | 2 - .../llvm/lib/Target/VE/VERegisterInfo.td | 104 +- .../llvm/lib/Target/VE/VESubtarget.cpp | 64 +- .../llvm-project/llvm/lib/Target/VE/VESubtarget.h | 21 +- .../llvm/lib/Target/VE/VETargetMachine.cpp | 32 +- .../llvm/lib/Target/VE/VETargetTransformInfo.h | 25 +- .../llvm/lib/Target/VE/VVPInstrInfo.td | 46 + .../llvm/lib/Target/VE/VVPInstrPatternsVec.td | 71 + .../llvm-project/llvm/lib/Target/VE/VVPNodes.def | 41 + .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp | 140 +- .../Disassembler/WebAssemblyDisassembler.cpp | 23 + .../MCTargetDesc/WebAssemblyAsmBackend.cpp | 5 - .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 116 +- .../MCTargetDesc/WebAssemblyInstPrinter.h | 7 +- .../MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 12 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 8 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 95 +- .../MCTargetDesc/WebAssemblyTargetStreamer.cpp | 13 +- .../MCTargetDesc/WebAssemblyTargetStreamer.h | 5 + .../MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 18 +- .../Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 133 +- .../lib/Target/WebAssembly/WebAssemblyAsmPrinter.h | 3 + .../lib/Target/WebAssembly/WebAssemblyCFGSort.cpp | 91 +- .../Target/WebAssembly/WebAssemblyCFGStackify.cpp | 676 +-- .../WebAssembly/WebAssemblyDebugValueManager.cpp | 14 +- .../WebAssembly/WebAssemblyExplicitLocals.cpp | 34 +- .../lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 44 +- 
.../WebAssembly/WebAssemblyFixBrTableDefaults.cpp | 45 +- .../llvm/lib/Target/WebAssembly/WebAssemblyISD.def | 5 + .../Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 38 - .../Target/WebAssembly/WebAssemblyISelLowering.cpp | 329 +- .../Target/WebAssembly/WebAssemblyISelLowering.h | 1 + .../Target/WebAssembly/WebAssemblyInstrAtomics.td | 212 +- .../WebAssembly/WebAssemblyInstrBulkMemory.td | 2 +- .../Target/WebAssembly/WebAssemblyInstrControl.td | 45 +- .../Target/WebAssembly/WebAssemblyInstrInfo.cpp | 42 +- .../lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 39 +- .../Target/WebAssembly/WebAssemblyInstrMemory.td | 2 +- .../lib/Target/WebAssembly/WebAssemblyInstrRef.td | 36 +- .../lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 1199 +++-- .../Target/WebAssembly/WebAssemblyInstrTable.td | 64 + .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 325 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 123 +- .../Target/WebAssembly/WebAssemblyMCInstLower.cpp | 49 +- .../WebAssemblyOptimizeLiveIntervals.cpp | 2 +- .../lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 7 +- .../Target/WebAssembly/WebAssemblyRegColoring.cpp | 22 +- .../Target/WebAssembly/WebAssemblyRegStackify.cpp | 44 +- .../Target/WebAssembly/WebAssemblyRegisterInfo.td | 6 +- .../Target/WebAssembly/WebAssemblySortRegion.cpp | 78 + .../lib/Target/WebAssembly/WebAssemblySortRegion.h | 91 + .../Target/WebAssembly/WebAssemblySubtarget.cpp | 9 +- .../lib/Target/WebAssembly/WebAssemblySubtarget.h | 2 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 34 +- .../Target/WebAssembly/WebAssemblyTargetMachine.h | 1 + .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 18 + .../WebAssembly/WebAssemblyTargetTransformInfo.h | 3 + .../Target/WebAssembly/WebAssemblyUtilities.cpp | 33 + .../lib/Target/WebAssembly/WebAssemblyUtilities.h | 28 +- .../llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 1218 ++++- .../Target/X86/Disassembler/X86Disassembler.cpp | 7 +- .../Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 11 + 
.../Target/X86/MCTargetDesc/X86ATTInstPrinter.h | 1 + .../lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 42 +- .../llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 25 +- .../Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 9 + .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 19 + .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 10 + .../Target/X86/MCTargetDesc/X86IntelInstPrinter.h | 1 + .../Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 225 +- .../Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 13 +- .../lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 6 +- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 2 +- .../Target/X86/MCTargetDesc/X86ShuffleDecode.cpp | 2 +- .../Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 9 +- contrib/llvm-project/llvm/lib/Target/X86/X86.h | 11 +- contrib/llvm-project/llvm/lib/Target/X86/X86.td | 868 ++-- .../llvm/lib/Target/X86/X86AsmPrinter.cpp | 10 +- .../llvm/lib/Target/X86/X86AsmPrinter.h | 6 +- .../Target/X86/X86AvoidStoreForwardingBlocks.cpp | 4 +- .../lib/Target/X86/X86CallFrameOptimization.cpp | 23 +- .../llvm/lib/Target/X86/X86CallLowering.cpp | 70 +- .../llvm/lib/Target/X86/X86CallLowering.h | 6 +- .../llvm/lib/Target/X86/X86CallingConv.cpp | 10 + .../llvm/lib/Target/X86/X86CallingConv.td | 15 +- .../llvm/lib/Target/X86/X86CmovConversion.cpp | 6 +- .../llvm/lib/Target/X86/X86CondBrFolding.cpp | 579 --- .../llvm/lib/Target/X86/X86DomainReassignment.cpp | 48 +- .../llvm/lib/Target/X86/X86EvexToVex.cpp | 31 +- .../llvm/lib/Target/X86/X86ExpandPseudo.cpp | 77 +- .../llvm/lib/Target/X86/X86FastISel.cpp | 104 +- .../llvm/lib/Target/X86/X86FixupBWInsts.cpp | 5 +- .../llvm/lib/Target/X86/X86FixupLEAs.cpp | 15 +- .../llvm/lib/Target/X86/X86FixupSetCC.cpp | 17 +- .../llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 8 +- .../llvm/lib/Target/X86/X86FrameLowering.cpp | 168 +- .../llvm/lib/Target/X86/X86FrameLowering.h | 23 +- .../llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 648 ++- .../llvm/lib/Target/X86/X86ISelLowering.cpp | 4966 
+++++++++++------- .../llvm/lib/Target/X86/X86ISelLowering.h | 60 +- .../lib/Target/X86/X86IndirectBranchTracking.cpp | 2 +- .../llvm/lib/Target/X86/X86IndirectThunks.cpp | 1 - .../llvm/lib/Target/X86/X86InsertPrefetch.cpp | 8 +- .../llvm/lib/Target/X86/X86InsertWait.cpp | 6 +- .../lib/Target/X86/X86InstCombineIntrinsic.cpp | 2017 ++++++++ .../llvm/lib/Target/X86/X86InstrAMX.td | 54 +- .../llvm/lib/Target/X86/X86InstrAVX512.td | 738 +-- .../llvm/lib/Target/X86/X86InstrArithmetic.td | 9 + .../llvm/lib/Target/X86/X86InstrCompiler.td | 192 +- .../llvm/lib/Target/X86/X86InstrFMA.td | 22 +- .../llvm/lib/Target/X86/X86InstrFPStack.td | 8 +- .../llvm/lib/Target/X86/X86InstrFoldTables.cpp | 14 + .../llvm/lib/Target/X86/X86InstrFormats.td | 6 + .../llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 36 +- .../llvm/lib/Target/X86/X86InstrInfo.cpp | 255 +- .../llvm/lib/Target/X86/X86InstrInfo.h | 32 +- .../llvm/lib/Target/X86/X86InstrInfo.td | 150 +- .../llvm-project/llvm/lib/Target/X86/X86InstrKL.td | 86 + .../llvm/lib/Target/X86/X86InstrMMX.td | 2 + .../llvm/lib/Target/X86/X86InstrSNP.td | 47 + .../llvm/lib/Target/X86/X86InstrSSE.td | 221 +- .../llvm/lib/Target/X86/X86InstrSVM.td | 28 +- .../llvm/lib/Target/X86/X86InstrSystem.td | 14 +- .../llvm/lib/Target/X86/X86InstrTDX.td | 39 + .../llvm/lib/Target/X86/X86InstructionSelector.cpp | 99 +- .../llvm/lib/Target/X86/X86InterleavedAccess.cpp | 24 +- .../llvm/lib/Target/X86/X86IntrinsicsInfo.h | 40 +- .../llvm/lib/Target/X86/X86LegalizerInfo.cpp | 35 +- .../X86/X86LoadValueInjectionLoadHardening.cpp | 184 +- .../X86/X86LoadValueInjectionRetHardening.cpp | 81 +- .../llvm/lib/Target/X86/X86LowerAMXType.cpp | 351 ++ .../llvm/lib/Target/X86/X86MCInstLower.cpp | 69 +- .../llvm/lib/Target/X86/X86PartialReduction.cpp | 9 +- .../llvm/lib/Target/X86/X86PreTileConfig.cpp | 265 + .../llvm/lib/Target/X86/X86RegisterInfo.cpp | 139 +- .../llvm/lib/Target/X86/X86RegisterInfo.h | 14 +- .../llvm/lib/Target/X86/X86RegisterInfo.td | 12 +- 
.../llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 8 + .../Target/X86/X86ShuffleDecodeConstantPool.cpp | 53 +- .../lib/Target/X86/X86ShuffleDecodeConstantPool.h | 8 - ...86SpeculativeExecutionSideEffectSuppression.cpp | 1 + .../lib/Target/X86/X86SpeculativeLoadHardening.cpp | 17 +- .../llvm/lib/Target/X86/X86Subtarget.cpp | 102 +- .../llvm/lib/Target/X86/X86Subtarget.h | 68 +- .../llvm/lib/Target/X86/X86TargetMachine.cpp | 117 +- .../llvm/lib/Target/X86/X86TargetMachine.h | 2 + .../llvm/lib/Target/X86/X86TargetObjectFile.cpp | 8 - .../llvm/lib/Target/X86/X86TargetObjectFile.h | 2 +- .../llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 358 +- .../llvm/lib/Target/X86/X86TargetTransformInfo.h | 26 +- .../llvm/lib/Target/X86/X86TileConfig.cpp | 248 + .../llvm/lib/Target/X86/X86WinEHState.cpp | 2 +- .../Target/XCore/MCTargetDesc/XCoreInstPrinter.h | 1 + .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 2 +- .../llvm/lib/Target/XCore/XCoreISelLowering.cpp | 18 +- .../lib/Target/XCore/XCoreLowerThreadLocal.cpp | 56 +- .../llvm/lib/Target/XCore/XCoreSubtarget.cpp | 4 +- .../llvm/lib/Target/XCore/XCoreSubtarget.h | 2 +- .../llvm/lib/Target/XCore/XCoreTargetMachine.cpp | 4 +- .../llvm/lib/Testing/Support/Annotations.cpp | 12 +- .../llvm-project/llvm/lib/TextAPI/ELF/ELFStub.cpp | 28 - .../llvm/lib/TextAPI/ELF/TBEHandler.cpp | 160 - .../llvm/lib/TextAPI/MachO/InterfaceFile.cpp | 2 - .../llvm/lib/TextAPI/MachO/Platform.cpp | 4 +- .../llvm-project/llvm/lib/TextAPI/MachO/Target.cpp | 1 + .../llvm/lib/TextAPI/MachO/TextStub.cpp | 27 +- .../llvm/lib/TextAPI/MachO/TextStubCommon.cpp | 3 + .../llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 44 +- .../AggressiveInstCombine.cpp | 149 +- .../AggressiveInstCombine/TruncInstCombine.cpp | 13 +- .../llvm/lib/Transforms/Coroutines/CoroCleanup.cpp | 1 + .../llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 20 +- .../llvm/lib/Transforms/Coroutines/CoroElide.cpp | 25 +- .../llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 1518 ++++-- 
.../llvm/lib/Transforms/Coroutines/CoroInstr.h | 181 +- .../llvm/lib/Transforms/Coroutines/CoroInternal.h | 59 +- .../llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 595 ++- .../llvm/lib/Transforms/Coroutines/Coroutines.cpp | 109 +- .../llvm/lib/Transforms/HelloNew/HelloWorld.cpp | 17 + .../llvm/lib/Transforms/IPO/AlwaysInliner.cpp | 55 +- .../lib/Transforms/IPO/Annotation2Metadata.cpp | 106 + .../llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 44 +- .../llvm/lib/Transforms/IPO/Attributor.cpp | 542 +- .../lib/Transforms/IPO/AttributorAttributes.cpp | 1306 ++++- .../llvm/lib/Transforms/IPO/BlockExtractor.cpp | 79 +- .../llvm/lib/Transforms/IPO/ConstantMerge.cpp | 2 + .../lib/Transforms/IPO/DeadArgumentElimination.cpp | 2 +- .../llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp | 47 +- .../llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 271 +- .../llvm/lib/Transforms/IPO/FunctionImport.cpp | 49 +- .../llvm/lib/Transforms/IPO/GlobalOpt.cpp | 24 +- .../llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 100 +- .../lib/Transforms/IPO/IPConstantPropagation.cpp | 308 -- .../llvm-project/llvm/lib/Transforms/IPO/IPO.cpp | 11 +- .../llvm/lib/Transforms/IPO/IROutliner.cpp | 1764 +++++++ .../llvm/lib/Transforms/IPO/Inliner.cpp | 100 +- .../llvm/lib/Transforms/IPO/LoopExtractor.cpp | 137 +- .../llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 32 +- .../llvm/lib/Transforms/IPO/MergeFunctions.cpp | 6 +- .../llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1350 ++++- .../llvm/lib/Transforms/IPO/PartialInlining.cpp | 329 +- .../llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 161 +- .../llvm/lib/Transforms/IPO/PruneEH.cpp | 72 +- .../lib/Transforms/IPO/SampleContextTracker.cpp | 585 +++ .../llvm/lib/Transforms/IPO/SampleProfile.cpp | 980 +++- .../llvm/lib/Transforms/IPO/SampleProfileProbe.cpp | 434 ++ .../llvm/lib/Transforms/IPO/StripSymbols.cpp | 56 +- .../lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6 +- .../llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 22 +- 
.../Transforms/InstCombine/InstCombineAddSub.cpp | 335 +- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 774 +-- .../InstCombine/InstCombineAtomicRMW.cpp | 18 +- .../Transforms/InstCombine/InstCombineCalls.cpp | 3340 ++----------- .../Transforms/InstCombine/InstCombineCasts.cpp | 376 +- .../Transforms/InstCombine/InstCombineCompares.cpp | 502 +- .../Transforms/InstCombine/InstCombineInternal.h | 407 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 232 +- .../InstCombine/InstCombineMulDivRem.cpp | 209 +- .../Transforms/InstCombine/InstCombineNegator.cpp | 149 +- .../lib/Transforms/InstCombine/InstCombinePHI.cpp | 253 +- .../Transforms/InstCombine/InstCombineSelect.cpp | 421 +- .../Transforms/InstCombine/InstCombineShifts.cpp | 266 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 604 +-- .../Transforms/InstCombine/InstCombineTables.td | 11 - .../InstCombine/InstCombineVectorOps.cpp | 434 +- .../InstCombine/InstructionCombining.cpp | 422 +- .../Instrumentation/AddressSanitizer.cpp | 166 +- .../llvm/lib/Transforms/Instrumentation/CFGMST.h | 17 +- .../lib/Transforms/Instrumentation/CGProfile.cpp | 3 +- .../Instrumentation/ControlHeightReduction.cpp | 38 +- .../Instrumentation/DataFlowSanitizer.cpp | 859 +++- .../Transforms/Instrumentation/GCOVProfiling.cpp | 682 +-- .../Instrumentation/HWAddressSanitizer.cpp | 311 +- .../Instrumentation/IndirectCallPromotion.cpp | 9 +- .../Transforms/Instrumentation/InstrProfiling.cpp | 109 +- .../Transforms/Instrumentation/Instrumentation.cpp | 4 +- .../lib/Transforms/Instrumentation/MemProfiler.cpp | 638 +++ .../Transforms/Instrumentation/MemorySanitizer.cpp | 411 +- .../Instrumentation/PGOInstrumentation.cpp | 324 +- .../Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 38 +- .../Transforms/Instrumentation/PoisonChecking.cpp | 10 +- .../Instrumentation/SanitizerCoverage.cpp | 124 +- .../Transforms/Instrumentation/ThreadSanitizer.cpp | 186 +- .../Instrumentation/ValueProfileCollector.cpp | 2 +- 
.../Instrumentation/ValueProfileCollector.h | 9 +- .../lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 67 +- .../lib/Transforms/ObjCARC/DependencyAnalysis.h | 12 +- .../llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 4 +- .../llvm/lib/Transforms/ObjCARC/ObjCARC.h | 2 - .../llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 78 +- .../lib/Transforms/ObjCARC/ObjCARCContract.cpp | 226 +- .../llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp | 78 +- .../llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 337 +- .../lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp | 25 +- .../lib/Transforms/ObjCARC/ProvenanceAnalysis.h | 12 +- .../ObjCARC/ProvenanceAnalysisEvaluator.cpp | 4 +- .../llvm/lib/Transforms/ObjCARC/PtrState.cpp | 4 +- .../llvm/lib/Transforms/Scalar/ADCE.cpp | 4 +- .../Transforms/Scalar/AlignmentFromAssumptions.cpp | 121 +- .../lib/Transforms/Scalar/AnnotationRemarks.cpp | 90 + .../lib/Transforms/Scalar/CallSiteSplitting.cpp | 2 +- .../lib/Transforms/Scalar/ConstantHoisting.cpp | 15 +- .../llvm/lib/Transforms/Scalar/ConstantProp.cpp | 121 - .../Transforms/Scalar/ConstraintElimination.cpp | 407 ++ .../Scalar/CorrelatedValuePropagation.cpp | 339 +- .../llvm/lib/Transforms/Scalar/DCE.cpp | 66 +- .../lib/Transforms/Scalar/DeadStoreElimination.cpp | 1066 ++-- .../llvm/lib/Transforms/Scalar/DivRemPairs.cpp | 10 +- .../llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 432 +- .../llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp | 1 + .../llvm/lib/Transforms/Scalar/Float2Int.cpp | 1 - .../llvm/lib/Transforms/Scalar/GVN.cpp | 417 +- .../llvm/lib/Transforms/Scalar/GVNHoist.cpp | 1383 +++--- .../llvm/lib/Transforms/Scalar/GVNSink.cpp | 13 +- .../llvm/lib/Transforms/Scalar/GuardWidening.cpp | 14 +- .../llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 1308 +---- .../Scalar/InductiveRangeCheckElimination.cpp | 104 +- .../lib/Transforms/Scalar/InferAddressSpaces.cpp | 160 +- .../lib/Transforms/Scalar/InstSimplifyPass.cpp | 2 + .../llvm/lib/Transforms/Scalar/JumpThreading.cpp | 352 +- 
.../llvm/lib/Transforms/Scalar/LICM.cpp | 333 +- .../lib/Transforms/Scalar/LoopDataPrefetch.cpp | 2 +- .../llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 129 +- .../llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 16 +- .../llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 728 +++ .../llvm/lib/Transforms/Scalar/LoopFuse.cpp | 329 +- .../lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 534 +- .../llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 135 +- .../lib/Transforms/Scalar/LoopLoadElimination.cpp | 63 +- .../llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 313 +- .../llvm/lib/Transforms/Scalar/LoopPredication.cpp | 15 +- .../llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 72 +- .../llvm/lib/Transforms/Scalar/LoopRotation.cpp | 45 +- .../llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 28 +- .../llvm/lib/Transforms/Scalar/LoopSink.cpp | 148 +- .../lib/Transforms/Scalar/LoopStrengthReduce.cpp | 139 +- .../lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 12 +- .../llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 92 +- .../llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 325 +- .../lib/Transforms/Scalar/LoopVersioningLICM.cpp | 110 +- .../Transforms/Scalar/LowerConstantIntrinsics.cpp | 10 +- .../lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 21 +- .../Transforms/Scalar/LowerMatrixIntrinsics.cpp | 291 +- .../llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 860 +++- .../llvm/lib/Transforms/Scalar/MergeICmps.cpp | 24 +- .../llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 90 +- .../llvm/lib/Transforms/Scalar/NewGVN.cpp | 127 +- .../llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 9 +- .../llvm/lib/Transforms/Scalar/Reassociate.cpp | 114 +- .../llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 120 +- .../Transforms/Scalar/RewriteStatepointsForGC.cpp | 147 +- .../llvm/lib/Transforms/Scalar/SCCP.cpp | 419 +- .../llvm/lib/Transforms/Scalar/SROA.cpp | 351 +- .../llvm/lib/Transforms/Scalar/Scalar.cpp | 37 +- .../Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 948 ++++ 
.../llvm/lib/Transforms/Scalar/Scalarizer.cpp | 9 +- .../Scalar/SeparateConstOffsetFromGEP.cpp | 84 +- .../lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 74 +- .../llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 188 +- .../llvm/lib/Transforms/Scalar/Sink.cpp | 80 +- .../lib/Transforms/Scalar/SpeculateAroundPHIs.cpp | 11 +- .../lib/Transforms/Scalar/SpeculativeExecution.cpp | 9 +- .../Scalar/StraightLineStrengthReduce.cpp | 101 +- .../llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 171 +- .../Transforms/Scalar/TailRecursionElimination.cpp | 146 +- .../lib/Transforms/Scalar/WarnMissedTransforms.cpp | 6 +- .../llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 3 - .../lib/Transforms/Utils/AssumeBundleBuilder.cpp | 5 +- .../llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 320 +- .../lib/Transforms/Utils/BreakCriticalEdges.cpp | 35 +- .../llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 446 +- .../llvm/lib/Transforms/Utils/CallGraphUpdater.cpp | 5 +- .../lib/Transforms/Utils/CallPromotionUtils.cpp | 13 +- .../Transforms/Utils/CanonicalizeFreezeInLoops.cpp | 4 +- .../llvm/lib/Transforms/Utils/CloneFunction.cpp | 150 +- .../llvm/lib/Transforms/Utils/CloneModule.cpp | 15 +- .../llvm/lib/Transforms/Utils/CodeExtractor.cpp | 60 +- .../llvm/lib/Transforms/Utils/CodeMoverUtils.cpp | 45 +- .../llvm/lib/Transforms/Utils/Debugify.cpp | 103 +- .../lib/Transforms/Utils/EntryExitInstrumenter.cpp | 13 +- .../llvm/lib/Transforms/Utils/EscapeEnumerator.cpp | 5 +- .../llvm/lib/Transforms/Utils/Evaluator.cpp | 14 +- .../llvm/lib/Transforms/Utils/FixIrreducible.cpp | 32 +- .../lib/Transforms/Utils/FunctionComparator.cpp | 25 +- .../llvm/lib/Transforms/Utils/GlobalStatus.cpp | 3 +- .../llvm/lib/Transforms/Utils/GuardUtils.cpp | 2 +- .../Utils/ImportedFunctionsInliningStatistics.cpp | 202 - .../lib/Transforms/Utils/InjectTLIMappings.cpp | 4 +- .../llvm/lib/Transforms/Utils/InlineFunction.cpp | 273 +- .../llvm/lib/Transforms/Utils/InstructionNamer.cpp | 59 +- 
.../llvm/lib/Transforms/Utils/LCSSA.cpp | 82 +- .../llvm/lib/Transforms/Utils/Local.cpp | 753 +-- .../llvm/lib/Transforms/Utils/LoopPeel.cpp | 862 ++++ .../lib/Transforms/Utils/LoopRotationUtils.cpp | 108 +- .../llvm/lib/Transforms/Utils/LoopSimplify.cpp | 24 +- .../llvm/lib/Transforms/Utils/LoopUnroll.cpp | 129 +- .../llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp | 35 +- .../llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp | 798 --- .../lib/Transforms/Utils/LoopUnrollRuntime.cpp | 33 +- .../llvm/lib/Transforms/Utils/LoopUtils.cpp | 485 +- .../llvm/lib/Transforms/Utils/LoopVersioning.cpp | 150 +- .../llvm/lib/Transforms/Utils/LowerInvoke.cpp | 2 +- .../llvm/lib/Transforms/Utils/LowerSwitch.cpp | 403 +- .../llvm/lib/Transforms/Utils/MatrixUtils.cpp | 104 + .../llvm/lib/Transforms/Utils/MetaRenamer.cpp | 235 +- .../llvm/lib/Transforms/Utils/MisExpect.cpp | 178 - .../llvm/lib/Transforms/Utils/PredicateInfo.cpp | 225 +- .../Transforms/Utils/PromoteMemoryToRegister.cpp | 118 +- .../llvm/lib/Transforms/Utils/SSAUpdater.cpp | 19 +- .../Transforms/Utils/ScalarEvolutionExpander.cpp | 753 +-- .../llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 1893 ++++--- .../llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 1130 ++++- .../llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 105 +- .../llvm/lib/Transforms/Utils/SizeOpts.cpp | 5 - .../llvm/lib/Transforms/Utils/StripGCRelocates.cpp | 46 +- .../Utils/StripNonLineTableDebugInfo.cpp | 23 +- .../Transforms/Utils/UnifyFunctionExitNodes.cpp | 113 +- .../llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 53 +- .../Utils/UniqueInternalLinkageNames.cpp | 23 +- .../llvm/lib/Transforms/Utils/Utils.cpp | 10 +- .../llvm/lib/Transforms/Utils/VNCoercion.cpp | 35 +- .../llvm/lib/Transforms/Utils/ValueMapper.cpp | 28 +- .../Transforms/Vectorize/LoadStoreVectorizer.cpp | 18 +- .../Vectorize/LoopVectorizationLegality.cpp | 110 +- .../Vectorize/LoopVectorizationPlanner.h | 52 +- .../lib/Transforms/Vectorize/LoopVectorize.cpp | 3673 ++++++++++---- 
.../lib/Transforms/Vectorize/SLPVectorizer.cpp | 2251 +++++---- .../lib/Transforms/Vectorize/VPRecipeBuilder.h | 10 +- .../llvm/lib/Transforms/Vectorize/VPlan.cpp | 302 +- .../llvm/lib/Transforms/Vectorize/VPlan.h | 606 ++- .../lib/Transforms/Vectorize/VPlanPredicator.cpp | 4 +- .../llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 25 +- .../lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 +- .../llvm/lib/Transforms/Vectorize/VPlanValue.h | 214 +- .../lib/Transforms/Vectorize/VPlanVerifier.cpp | 8 +- .../lib/Transforms/Vectorize/VectorCombine.cpp | 207 +- .../lib/WindowsManifest/WindowsManifestMerger.cpp | 6 +- .../llvm/lib/XRay/InstrumentationMap.cpp | 24 +- .../llvm/tools/bugpoint/CrashDebugger.cpp | 2 +- .../llvm/tools/bugpoint/ExecutionDriver.cpp | 18 +- .../llvm/tools/bugpoint/ExtractFunction.cpp | 2 +- .../llvm/tools/bugpoint/OptimizerDriver.cpp | 3 + .../llvm/tools/bugpoint/ToolRunner.cpp | 2 +- contrib/llvm-project/llvm/tools/llc/llc.cpp | 124 +- .../llvm/tools/lli/ChildTarget/ChildTarget.cpp | 6 +- .../llvm-project/llvm/tools/lli/RemoteJITUtils.h | 72 +- contrib/llvm-project/llvm/tools/lli/lli.cpp | 28 +- .../llvm-project/llvm/tools/llvm-ar/llvm-ar.cpp | 2 +- .../llvm/tools/llvm-cov/CodeCoverage.cpp | 96 +- .../llvm/tools/llvm-cov/CoverageExporterJson.cpp | 100 +- .../llvm/tools/llvm-cov/CoverageExporterLcov.cpp | 99 +- .../llvm/tools/llvm-cov/CoverageReport.cpp | 49 +- .../llvm/tools/llvm-cov/CoverageSummaryInfo.cpp | 38 +- .../llvm/tools/llvm-cov/CoverageSummaryInfo.h | 54 +- .../llvm/tools/llvm-cov/CoverageViewOptions.h | 6 + .../llvm/tools/llvm-cov/SourceCoverageView.cpp | 23 +- .../llvm/tools/llvm-cov/SourceCoverageView.h | 28 + .../llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp | 76 +- .../llvm/tools/llvm-cov/SourceCoverageViewHTML.h | 3 + .../llvm/tools/llvm-cov/SourceCoverageViewText.cpp | 50 +- .../llvm/tools/llvm-cov/SourceCoverageViewText.h | 3 + contrib/llvm-project/llvm/tools/llvm-cov/gcov.cpp | 26 +- .../llvm/tools/llvm-diff/DifferenceEngine.cpp 
| 2 +- .../llvm/tools/llvm-dwarfdump/Statistics.cpp | 202 +- .../llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 4 +- .../llvm-project/llvm/tools/llvm-dwp/llvm-dwp.cpp | 56 +- .../llvm/tools/llvm-link/llvm-link.cpp | 53 +- .../llvm-project/llvm/tools/llvm-lto/llvm-lto.cpp | 46 +- .../llvm/tools/llvm-lto2/llvm-lto2.cpp | 21 +- .../llvm-project/llvm/tools/llvm-mc/llvm-mc.cpp | 8 + .../llvm/tools/llvm-mca/CodeRegion.cpp | 1 - .../llvm/tools/llvm-mca/PipelinePrinter.cpp | 2 +- .../llvm/tools/llvm-mca/PipelinePrinter.h | 4 +- .../tools/llvm-mca/Views/BottleneckAnalysis.cpp | 47 +- .../llvm/tools/llvm-mca/Views/BottleneckAnalysis.h | 13 +- .../llvm/tools/llvm-mca/Views/DispatchStatistics.h | 1 + .../tools/llvm-mca/Views/InstructionInfoView.cpp | 127 +- .../tools/llvm-mca/Views/InstructionInfoView.h | 28 +- .../llvm/tools/llvm-mca/Views/InstructionView.cpp | 60 + .../llvm/tools/llvm-mca/Views/InstructionView.h | 67 + .../tools/llvm-mca/Views/RegisterFileStatistics.h | 3 + .../tools/llvm-mca/Views/ResourcePressureView.cpp | 52 +- .../tools/llvm-mca/Views/ResourcePressureView.h | 10 +- .../llvm-mca/Views/RetireControlUnitStatistics.h | 3 + .../tools/llvm-mca/Views/SchedulerStatistics.h | 1 + .../llvm/tools/llvm-mca/Views/SummaryView.cpp | 54 +- .../llvm/tools/llvm-mca/Views/SummaryView.h | 18 +- .../llvm/tools/llvm-mca/Views/TimelineView.cpp | 68 +- .../llvm/tools/llvm-mca/Views/TimelineView.h | 11 +- .../llvm/tools/llvm-mca/Views/View.cpp | 3 + .../llvm-project/llvm/tools/llvm-mca/Views/View.h | 18 + .../llvm-project/llvm/tools/llvm-mca/llvm-mca.cpp | 22 +- .../llvm-project/llvm/tools/llvm-nm/llvm-nm.cpp | 1082 ++-- .../llvm/tools/llvm-objcopy/BitcodeStripOpts.td | 24 + .../llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp | 38 +- .../llvm/tools/llvm-objcopy/COFF/Object.cpp | 69 +- .../llvm/tools/llvm-objcopy/COFF/Object.h | 4 +- .../llvm/tools/llvm-objcopy/CopyConfig.cpp | 130 +- .../llvm/tools/llvm-objcopy/CopyConfig.h | 9 + .../llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp | 153 
+- .../llvm/tools/llvm-objcopy/ELF/Object.cpp | 891 ++-- .../llvm/tools/llvm-objcopy/ELF/Object.h | 264 +- .../llvm/tools/llvm-objcopy/InstallNameToolOpts.td | 10 + .../llvm-objcopy/MachO/MachOLayoutBuilder.cpp | 14 +- .../tools/llvm-objcopy/MachO/MachOLayoutBuilder.h | 8 +- .../llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp | 148 +- .../llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h | 4 + .../llvm/tools/llvm-objcopy/MachO/MachOReader.cpp | 51 +- .../llvm/tools/llvm-objcopy/MachO/MachOReader.h | 6 +- .../llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp | 14 +- .../llvm/tools/llvm-objcopy/MachO/Object.cpp | 59 +- .../llvm/tools/llvm-objcopy/MachO/Object.h | 15 +- .../llvm/tools/llvm-objcopy/llvm-objcopy.cpp | 108 +- .../llvm/tools/llvm-objcopy/llvm-objcopy.h | 34 +- .../llvm/tools/llvm-objcopy/wasm/Object.cpp | 4 +- .../llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp | 1 - .../llvm/tools/llvm-objdump/ELFDump.cpp | 49 +- .../llvm/tools/llvm-objdump/MachODump.cpp | 90 +- .../llvm/tools/llvm-objdump/llvm-objdump.cpp | 281 +- .../llvm/tools/llvm-objdump/llvm-objdump.h | 6 +- .../llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp | 17 +- .../llvm/tools/llvm-pdbutil/FormatUtil.cpp | 7 +- .../llvm/tools/llvm-pdbutil/FormatUtil.h | 4 +- .../tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 13 +- .../llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp | 4 +- .../llvm/tools/llvm-profdata/llvm-profdata.cpp | 1204 ++++- .../llvm/tools/llvm-readobj/ARMEHABIPrinter.h | 135 +- .../llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp | 211 +- .../llvm/tools/llvm-readobj/ARMWinEHPrinter.h | 12 + .../llvm/tools/llvm-readobj/COFFDumper.cpp | 64 +- .../llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h | 82 +- .../llvm/tools/llvm-readobj/ELFDumper.cpp | 5244 +++++++++----------- .../llvm-project/llvm/tools/llvm-readobj/Error.cpp | 56 - .../llvm-project/llvm/tools/llvm-readobj/Error.h | 40 - .../llvm/tools/llvm-readobj/MachODumper.cpp | 41 +- .../llvm/tools/llvm-readobj/ObjDumper.cpp | 52 +- .../llvm/tools/llvm-readobj/ObjDumper.h | 
49 +- .../llvm/tools/llvm-readobj/WasmDumper.cpp | 16 +- .../llvm/tools/llvm-readobj/Win64EHDumper.cpp | 5 +- .../tools/llvm-readobj/WindowsResourceDumper.cpp | 1 - .../llvm/tools/llvm-readobj/XCOFFDumper.cpp | 15 +- .../llvm/tools/llvm-readobj/llvm-readobj.cpp | 157 +- .../llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp | 2 + .../llvm/tools/llvm-size/llvm-size.cpp | 4 +- .../llvm/tools/llvm-stress/llvm-stress.cpp | 38 +- .../llvm/tools/llvm-symbolizer/Opts.td | 71 + .../llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp | 383 +- .../llvm/tools/llvm-xray/xray-account.cpp | 84 +- .../llvm/tools/llvm-xray/xray-account.h | 31 +- .../llvm/tools/llvm-xray/xray-graph.cpp | 3 +- .../llvm/tools/llvm-xray/xray-stacks.cpp | 23 +- .../llvm-project/llvm/tools/opt/NewPMDriver.cpp | 158 +- contrib/llvm-project/llvm/tools/opt/NewPMDriver.h | 13 +- contrib/llvm-project/llvm/tools/opt/opt.cpp | 233 +- .../llvm/utils/TableGen/AsmMatcherEmitter.cpp | 139 +- .../llvm/utils/TableGen/AsmWriterEmitter.cpp | 113 +- .../llvm/utils/TableGen/AsmWriterInst.cpp | 7 +- .../llvm/utils/TableGen/CallingConvEmitter.cpp | 18 +- .../llvm/utils/TableGen/CodeEmitterGen.cpp | 8 +- .../llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 34 +- .../llvm/utils/TableGen/CodeGenDAGPatterns.h | 8 +- .../llvm/utils/TableGen/CodeGenInstruction.cpp | 2 +- .../llvm/utils/TableGen/CodeGenIntrinsics.h | 10 +- .../llvm/utils/TableGen/CodeGenMapTable.cpp | 26 +- .../llvm/utils/TableGen/CodeGenRegisters.cpp | 19 +- .../llvm/utils/TableGen/CodeGenRegisters.h | 4 +- .../llvm/utils/TableGen/CodeGenSchedule.cpp | 351 +- .../llvm/utils/TableGen/CodeGenSchedule.h | 13 +- .../llvm/utils/TableGen/CodeGenTarget.cpp | 255 +- .../llvm/utils/TableGen/CodeGenTarget.h | 9 +- .../llvm/utils/TableGen/DAGISelEmitter.cpp | 14 +- .../llvm/utils/TableGen/DAGISelMatcher.h | 20 +- .../llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 226 +- .../llvm/utils/TableGen/DAGISelMatcherGen.cpp | 10 +- .../llvm/utils/TableGen/DFAEmitter.cpp | 5 +- 
.../llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 2 +- .../llvm/utils/TableGen/DirectiveEmitter.cpp | 760 ++- .../llvm/utils/TableGen/ExegesisEmitter.cpp | 2 +- .../llvm/utils/TableGen/FixedLenDecoderEmitter.cpp | 44 +- .../llvm/utils/TableGen/GICombinerEmitter.cpp | 55 +- .../utils/TableGen/GlobalISel/CodeExpander.cpp | 23 +- .../llvm/utils/TableGen/GlobalISel/GIMatchDag.cpp | 2 +- .../utils/TableGen/GlobalISel/GIMatchDagInstr.cpp | 2 +- .../llvm/utils/TableGen/GlobalISel/GIMatchTree.cpp | 15 +- .../llvm/utils/TableGen/GlobalISel/GIMatchTree.h | 5 +- .../llvm/utils/TableGen/GlobalISelEmitter.cpp | 802 ++- .../llvm/utils/TableGen/InstrInfoEmitter.cpp | 39 +- .../llvm/utils/TableGen/IntrinsicEmitter.cpp | 20 +- .../llvm/utils/TableGen/OptParserEmitter.cpp | 309 +- .../llvm/utils/TableGen/PredicateExpander.cpp | 22 +- .../llvm/utils/TableGen/PredicateExpander.h | 3 + .../llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 114 +- .../utils/TableGen/RISCVCompressInstEmitter.cpp | 234 +- .../llvm/utils/TableGen/RegisterBankEmitter.cpp | 19 +- .../llvm/utils/TableGen/RegisterInfoEmitter.cpp | 47 +- .../llvm/utils/TableGen/SearchableTableEmitter.cpp | 229 +- .../llvm/utils/TableGen/SubtargetEmitter.cpp | 122 +- .../llvm/utils/TableGen/SubtargetFeatureInfo.cpp | 5 +- .../llvm/utils/TableGen/SubtargetFeatureInfo.h | 5 - .../llvm-project/llvm/utils/TableGen/TableGen.cpp | 30 +- .../TableGen/WebAssemblyDisassemblerEmitter.cpp | 15 +- .../llvm/utils/TableGen/X86DisassemblerTables.cpp | 2 +- .../llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 27 +- .../llvm/utils/TableGen/X86RecognizableInstr.cpp | 2 +- 2737 files changed, 249293 insertions(+), 114395 deletions(-) create mode 100644 contrib/llvm-project/llvm/include/llvm-c/LLJIT.h delete mode 100644 contrib/llvm-project/llvm/include/llvm-c/LinkTimeOptimizer.h delete mode 100644 contrib/llvm-project/llvm/include/llvm-c/OrcBindings.h create mode 100644 contrib/llvm-project/llvm/include/llvm-c/OrcEE.h create mode 100644 
contrib/llvm-project/llvm/include/llvm/ADT/APFixedPoint.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/ConstraintSystem.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/DDGPrinter.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/Delinearization.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/InlineFeaturesAnalysis.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/InstCount.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/MemDerefPrinter.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h create mode 100644 contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def create mode 100644 contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmTraits.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeCommon.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeConvenience.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/BasicBlockSectionUtils.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassManager.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassRegistry.def create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/MachineStableHash.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/MultiHazardRecognizer.h create mode 100644 
contrib/llvm-project/llvm/include/llvm/CodeGen/StableHashing.h create mode 100644 contrib/llvm-project/llvm/include/llvm/CodeGen/TileShapeInfo.h create mode 100644 contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h create mode 100644 contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkDylib.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCTargetProcessControl.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCUtils.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RawByteChannel.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/RPCUtils.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h create mode 100644 
contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/Serialization.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TPCEHFrameRegistrar.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TPCIndirectionUtils.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h create mode 100644 contrib/llvm-project/llvm/include/llvm/FileCheck/FileCheck.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/Assumptions.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/FixedPointBuilder.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsVE.td create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/PrintPasses.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/PseudoProbe.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/ReplaceConstant.h create mode 100644 contrib/llvm-project/llvm/include/llvm/IR/StructuralHash.h create mode 100644 contrib/llvm-project/llvm/include/llvm/InterfaceStub/ELFObjHandler.h create mode 100644 contrib/llvm-project/llvm/include/llvm/InterfaceStub/ELFStub.h create mode 100644 contrib/llvm-project/llvm/include/llvm/InterfaceStub/TBEHandler.h create mode 100644 contrib/llvm-project/llvm/include/llvm/MC/MCPseudoProbe.h create mode 100644 
contrib/llvm-project/llvm/include/llvm/Object/MachOUniversalWriter.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/Object/WasmTraits.h create mode 100644 contrib/llvm-project/llvm/include/llvm/ObjectYAML/ArchiveYAML.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Remarks/HotnessThresholdParser.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Support/ExitCodes.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/Support/FileCheck.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Support/FileSystem/UniqueID.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Support/InstructionCost.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Support/MemoryBufferRef.h create mode 100644 contrib/llvm-project/llvm/include/llvm/TableGen/DirectiveEmitter.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Target/CGPassBuilderOption.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/TextAPI/ELF/ELFStub.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/TextAPI/ELF/TBEHandler.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/HelloNew/HelloWorld.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/Annotation2Metadata.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/BlockExtractor.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/IROutliner.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/LoopExtractor.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/IPO/StripSymbols.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h 
create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/AnnotationRemarks.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/ConstraintElimination.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/InferAddressSpaces.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/LoopInterchange.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/LoopReroll.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/LoopVersioningLICM.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/Reg2Mem.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/StraightLineStrengthReduce.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Scalar/StructurizeCFG.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/FixIrreducible.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/InstructionNamer.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/LoopPeel.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/LowerSwitch.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/MatrixUtils.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/MetaRenamer.h delete mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/MisExpect.h create 
mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/StripGCRelocates.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/StripNonLineTableDebugInfo.h create mode 100644 contrib/llvm-project/llvm/include/llvm/Transforms/Utils/UnifyLoopExits.h create mode 100644 contrib/llvm-project/llvm/lib/Analysis/ConstraintSystem.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/DDGPrinter.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/ImportedFunctionsInliningStatistics.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Analysis/InlineFeaturesAnalysis.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/ReplayInlineAdvisor.cpp create mode 100644 contrib/llvm-project/llvm/lib/Analysis/models/inliner/README.txt create mode 100644 contrib/llvm-project/llvm/lib/Analysis/models/inliner/output_spec.json create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h delete mode 100644 contrib/llvm-project/llvm/lib/CodeGen/BBSectionsPrepare.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSections.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/CodeGenPassBuilder.cpp delete mode 100644 contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp create mode 100644 
contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/MachineCheckDebugify.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionSplitter.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/MachinePassManager.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/MachineStableHash.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/MultiHazardRecognizer.cpp create mode 100644 contrib/llvm-project/llvm/lib/CodeGen/PseudoProbeInserter.cpp delete mode 100644 contrib/llvm-project/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp create mode 100644 contrib/llvm-project/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp create mode 100644 contrib/llvm-project/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Legacy.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/NullResolver.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Shared/RPCError.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp create mode 100644 
contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TPCEHFrameRegistrar.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.cpp create mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ExecutionEngine/OrcError/RPCError.cpp create mode 100644 contrib/llvm-project/llvm/lib/FileCheck/FileCheck.cpp create mode 100644 contrib/llvm-project/llvm/lib/FileCheck/FileCheckImpl.h create mode 100644 contrib/llvm-project/llvm/lib/IR/Assumptions.cpp create mode 100644 contrib/llvm-project/llvm/lib/IR/PrintPasses.cpp create mode 100644 contrib/llvm-project/llvm/lib/IR/PseudoProbe.cpp create mode 100644 contrib/llvm-project/llvm/lib/IR/ReplaceConstant.cpp create mode 100644 contrib/llvm-project/llvm/lib/IR/StructuralHash.cpp create mode 100644 contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp create mode 100644 contrib/llvm-project/llvm/lib/InterfaceStub/ELFStub.cpp create mode 100644 contrib/llvm-project/llvm/lib/InterfaceStub/TBEHandler.cpp create mode 100644 contrib/llvm-project/llvm/lib/MC/MCPseudoProbe.cpp create mode 100644 contrib/llvm-project/llvm/lib/Object/MachOUniversalWriter.cpp create mode 100644 contrib/llvm-project/llvm/lib/ObjectYAML/ArchiveEmitter.cpp create mode 100644 contrib/llvm-project/llvm/lib/ObjectYAML/ArchiveYAML.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ObjectYAML/DWARFVisitor.cpp delete mode 100644 contrib/llvm-project/llvm/lib/ObjectYAML/DWARFVisitor.h create mode 100644 contrib/llvm-project/llvm/lib/Support/APFixedPoint.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Support/FileCheck.cpp delete mode 100644 
contrib/llvm-project/llvm/lib/Support/FileCheckImpl.h create mode 100644 contrib/llvm-project/llvm/lib/Support/InstructionCost.cpp create mode 100644 contrib/llvm-project/llvm/lib/Support/MemoryBufferRef.cpp create mode 100644 contrib/llvm-project/llvm/lib/TableGen/DetailedRecordsBackend.cpp create mode 100644 contrib/llvm-project/llvm/lib/TableGen/TableGenBackendSkeleton.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA64FX.td create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedTSV110.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-saddo.mir create mode 100644 contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-ssubo.mir delete mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/InstCombineTables.td create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Subtarget.h delete mode 100644 
contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td create mode 100644 contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h create mode 100644 contrib/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetTransformInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKYRegisterInfo.td create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp create mode 100644 
contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h create mode 100644 contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrQPX.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td create mode 100644 
contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket32.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket64.td create mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VEInstrBuilder.h create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VEInstrVec.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VVPInstrInfo.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VVPInstrPatternsVec.td create mode 100644 contrib/llvm-project/llvm/lib/Target/VE/VVPNodes.def create mode 100644 contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td create mode 100644 contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h delete mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86CondBrFolding.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp create mode 100644 
contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp create mode 100644 contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp delete mode 100644 contrib/llvm-project/llvm/lib/TextAPI/ELF/ELFStub.cpp delete mode 100644 contrib/llvm-project/llvm/lib/TextAPI/ELF/TBEHandler.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/HelloNew/HelloWorld.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/IPO/Annotation2Metadata.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/IPO/IROutliner.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/IPO/SampleContextTracker.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineTables.td create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFlatten.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp delete mode 100644 
contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp create mode 100644 contrib/llvm-project/llvm/lib/Transforms/Utils/MatrixUtils.cpp delete mode 100644 contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp create mode 100644 contrib/llvm-project/llvm/tools/llvm-mca/Views/InstructionView.cpp create mode 100644 contrib/llvm-project/llvm/tools/llvm-mca/Views/InstructionView.h create mode 100644 contrib/llvm-project/llvm/tools/llvm-objcopy/BitcodeStripOpts.td delete mode 100644 contrib/llvm-project/llvm/tools/llvm-readobj/Error.cpp delete mode 100644 contrib/llvm-project/llvm/tools/llvm-readobj/Error.h create mode 100644 contrib/llvm-project/llvm/tools/llvm-symbolizer/Opts.td (limited to 'contrib/llvm-project/llvm') diff --git a/contrib/llvm-project/llvm/include/llvm-c/Core.h b/contrib/llvm-project/llvm/include/llvm-c/Core.h index c8a6f970419b..2901ab715810 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/Core.h +++ b/contrib/llvm-project/llvm/include/llvm-c/Core.h @@ -162,7 +162,8 @@ typedef enum { LLVMX86_MMXTypeKind, /**< X86 MMX */ LLVMTokenTypeKind, /**< Tokens */ LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */ - LLVMBFloatTypeKind /**< 16 bit brain floating point type */ + LLVMBFloatTypeKind, /**< 16 bit brain floating point type */ + LLVMX86_AMXTypeKind /**< X86 AMX */ } LLVMTypeKind; typedef enum { @@ -281,6 +282,7 @@ typedef enum { LLVMInlineAsmValueKind, LLVMInstructionValueKind, + LLVMPoisonValueValueKind } LLVMValueKind; typedef enum { @@ -602,6 +604,17 @@ unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A); */ uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A); +/** + * Create a type attribute + */ +LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID, + LLVMTypeRef type_ref); + +/** + * Get the type attribute's value. + */ +LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A); + /** * Create a string attribute. 
*/ @@ -624,6 +637,12 @@ const char *LLVMGetStringAttributeValue(LLVMAttributeRef A, unsigned *Length); */ LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A); LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A); +LLVMBool LLVMIsTypeAttribute(LLVMAttributeRef A); + +/** + * Obtain a Type from a context by its registered name. + */ +LLVMTypeRef LLVMGetTypeByName2(LLVMContextRef C, const char *Name); /** * @} @@ -866,9 +885,7 @@ LLVMValueRef LLVMGetInlineAsm(LLVMTypeRef Ty, */ LLVMContextRef LLVMGetModuleContext(LLVMModuleRef M); -/** - * Obtain a Type from a module by its registered name. - */ +/** Deprecated: Use LLVMGetTypeByName2 instead. */ LLVMTypeRef LLVMGetTypeByName(LLVMModuleRef M, const char *Name); /** @@ -1444,9 +1461,21 @@ unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy); LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount); /** - * Obtain the number of elements in a vector type. + * Create a vector type that contains a defined type and has a scalable + * number of elements. * - * This only works on types that represent vectors. + * The created type will exist in the context thats its element type + * exists in. + * + * @see llvm::ScalableVectorType::get() + */ +LLVMTypeRef LLVMScalableVectorType(LLVMTypeRef ElementType, + unsigned ElementCount); + +/** + * Obtain the (possibly scalable) number of elements in a vector type. + * + * This only works on types that represent vectors (fixed or scalable). * * @see llvm::VectorType::getNumElements() */ @@ -1477,6 +1506,11 @@ LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C); +/** + * Create a X86 AMX type in a context. + */ +LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C); + /** * Create a token type in a context. 
*/ @@ -1494,6 +1528,7 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C); LLVMTypeRef LLVMVoidType(void); LLVMTypeRef LLVMLabelType(void); LLVMTypeRef LLVMX86MMXType(void); +LLVMTypeRef LLVMX86AMXType(void); /** * @} @@ -1550,6 +1585,7 @@ LLVMTypeRef LLVMX86MMXType(void); macro(Function) \ macro(GlobalVariable) \ macro(UndefValue) \ + macro(PoisonValue) \ macro(Instruction) \ macro(UnaryOperator) \ macro(BinaryOperator) \ @@ -1683,6 +1719,11 @@ LLVMBool LLVMIsConstant(LLVMValueRef Val); */ LLVMBool LLVMIsUndef(LLVMValueRef Val); +/** + * Determine whether a value instance is poisonous. + */ +LLVMBool LLVMIsPoison(LLVMValueRef Val); + /** * Convert value instances between types. * @@ -1841,6 +1882,13 @@ LLVMValueRef LLVMConstAllOnes(LLVMTypeRef Ty); */ LLVMValueRef LLVMGetUndef(LLVMTypeRef Ty); +/** + * Obtain a constant value referring to a poison value of a type. + * + * @see llvm::PoisonValue::get() + */ +LLVMValueRef LLVMGetPoison(LLVMTypeRef Ty); + /** * Determine whether a value instance is null. 
* diff --git a/contrib/llvm-project/llvm/include/llvm-c/DataTypes.h b/contrib/llvm-project/llvm/include/llvm-c/DataTypes.h index 0f27ba81865e..4eb0ac97d97e 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/DataTypes.h +++ b/contrib/llvm-project/llvm/include/llvm-c/DataTypes.h @@ -77,8 +77,4 @@ typedef signed int ssize_t; # define UINT64_MAX 0xffffffffffffffffULL #endif -#ifndef HUGE_VALF -#define HUGE_VALF (float)HUGE_VAL -#endif - #endif /* LLVM_C_DATATYPES_H */ diff --git a/contrib/llvm-project/llvm/include/llvm-c/DebugInfo.h b/contrib/llvm-project/llvm/include/llvm-c/DebugInfo.h index cdf5f5a0cca8..5a9cd8e2ee63 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/DebugInfo.h +++ b/contrib/llvm-project/llvm/include/llvm-c/DebugInfo.h @@ -159,7 +159,9 @@ enum { LLVMDIImportedEntityMetadataKind, LLVMDIMacroMetadataKind, LLVMDIMacroFileMetadataKind, - LLVMDICommonBlockMetadataKind + LLVMDICommonBlockMetadataKind, + LLVMDIStringTypeMetadataKind, + LLVMDIGenericSubrangeMetadataKind }; typedef unsigned LLVMMetadataKind; diff --git a/contrib/llvm-project/llvm/include/llvm-c/Error.h b/contrib/llvm-project/llvm/include/llvm-c/Error.h index 92f81bf38304..bc702ac7a1bf 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/Error.h +++ b/contrib/llvm-project/llvm/include/llvm-c/Error.h @@ -62,6 +62,11 @@ void LLVMDisposeErrorMessage(char *ErrMsg); */ LLVMErrorTypeId LLVMGetStringErrorTypeId(void); +/** + * Create a StringError. + */ +LLVMErrorRef LLVMCreateStringError(const char *ErrMsg); + LLVM_C_EXTERN_C_END #endif diff --git a/contrib/llvm-project/llvm/include/llvm-c/LLJIT.h b/contrib/llvm-project/llvm/include/llvm-c/LLJIT.h new file mode 100644 index 000000000000..28eb8bbff96b --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm-c/LLJIT.h @@ -0,0 +1,213 @@ +/*===----------- llvm-c/LLJIT.h - OrcV2 LLJIT C bindings --------*- C++ -*-===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. 
*| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to the LLJIT class in *| +|* libLLVMOrcJIT.a, which provides a simple MCJIT-like ORC JIT. *| +|* *| +|* Many exotic languages can interoperate with C code but have a harder time *| +|* with C++ due to name mangling. So in addition to C, this interface enables *| +|* tools written in such languages. *| +|* *| +|* Note: This interface is experimental. It is *NOT* stable, and may be *| +|* changed without warning. Only C API usage documentation is *| +|* provided. See the C++ documentation for all higher level ORC API *| +|* details. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_LLJIT_H +#define LLVM_C_LLJIT_H + +#include "llvm-c/Error.h" +#include "llvm-c/Orc.h" +#include "llvm-c/TargetMachine.h" +#include "llvm-c/Types.h" + +LLVM_C_EXTERN_C_BEGIN + +/** + * A function for constructing an ObjectLinkingLayer instance to be used + * by an LLJIT instance. + * + * Clients can call LLVMOrcLLJITBuilderSetObjectLinkingLayerCreator to + * set the creator function to use when constructing an LLJIT instance. + * This can be used to override the default linking layer implementation + * that would otherwise be chosen by LLJITBuilder. + * + * Object linking layers returned by this function will become owned by the + * LLJIT instance. The client is not responsible for managing their lifetimes + * after the function returns. + */ +typedef LLVMOrcObjectLayerRef ( + *LLVMOrcLLJITBuilderObjectLinkingLayerCreatorFunction)( + void *Ctx, LLVMOrcExecutionSessionRef ES, const char *Triple); + +/** + * A reference to an orc::LLJITBuilder instance. 
+ */ +typedef struct LLVMOrcOpaqueLLJITBuilder *LLVMOrcLLJITBuilderRef; + +/** + * A reference to an orc::LLJIT instance. + */ +typedef struct LLVMOrcOpaqueLLJIT *LLVMOrcLLJITRef; + +/** + * Create an LLVMOrcLLJITBuilder. + * + * The client owns the resulting LLJITBuilder and should dispose of it using + * LLVMOrcDisposeLLJITBuilder once they are done with it. + */ +LLVMOrcLLJITBuilderRef LLVMOrcCreateLLJITBuilder(void); + +/** + * Dispose of an LLVMOrcLLJITBuilderRef. This should only be called if ownership + * has not been passed to LLVMOrcCreateLLJIT (e.g. because some error prevented + * that function from being called). + */ +void LLVMOrcDisposeLLJITBuilder(LLVMOrcLLJITBuilderRef Builder); + +/** + * Set the JITTargetMachineBuilder to be used when constructing the LLJIT + * instance. Calling this function is optional: if it is not called then the + * LLJITBuilder will use JITTargeTMachineBuilder::detectHost to construct a + * JITTargetMachineBuilder. + */ +void LLVMOrcLLJITBuilderSetJITTargetMachineBuilder( + LLVMOrcLLJITBuilderRef Builder, LLVMOrcJITTargetMachineBuilderRef JTMB); + +/** + * Set an ObjectLinkingLayer creator function for this LLJIT instance. + */ +void LLVMOrcLLJITBuilderSetObjectLinkingLayerCreator( + LLVMOrcLLJITBuilderRef Builder, + LLVMOrcLLJITBuilderObjectLinkingLayerCreatorFunction F, void *Ctx); + +/** + * Create an LLJIT instance from an LLJITBuilder. + * + * This operation takes ownership of the Builder argument: clients should not + * dispose of the builder after calling this function (even if the function + * returns an error). If a null Builder argument is provided then a + * default-constructed LLJITBuilder will be used. + * + * On success the resulting LLJIT instance is uniquely owned by the client and + * automatically manages the memory of all JIT'd code and all modules that are + * transferred to it (e.g. via LLVMOrcLLJITAddLLVMIRModule). 
Disposing of the + * LLJIT instance will free all memory managed by the JIT, including JIT'd code + * and not-yet compiled modules. + */ +LLVMErrorRef LLVMOrcCreateLLJIT(LLVMOrcLLJITRef *Result, + LLVMOrcLLJITBuilderRef Builder); + +/** + * Dispose of an LLJIT instance. + */ +LLVMErrorRef LLVMOrcDisposeLLJIT(LLVMOrcLLJITRef J); + +/** + * Get a reference to the ExecutionSession for this LLJIT instance. + * + * The ExecutionSession is owned by the LLJIT instance. The client is not + * responsible for managing its memory. + */ +LLVMOrcExecutionSessionRef LLVMOrcLLJITGetExecutionSession(LLVMOrcLLJITRef J); + +/** + * Return a reference to the Main JITDylib. + * + * The JITDylib is owned by the LLJIT instance. The client is not responsible + * for managing its memory. + */ +LLVMOrcJITDylibRef LLVMOrcLLJITGetMainJITDylib(LLVMOrcLLJITRef J); + +/** + * Return the target triple for this LLJIT instance. This string is owned by + * the LLJIT instance and should not be freed by the client. + */ +const char *LLVMOrcLLJITGetTripleString(LLVMOrcLLJITRef J); + +/** + * Returns the global prefix character according to the LLJIT's DataLayout. + */ +char LLVMOrcLLJITGetGlobalPrefix(LLVMOrcLLJITRef J); + +/** + * Mangles the given string according to the LLJIT instance's DataLayout, then + * interns the result in the SymbolStringPool and returns a reference to the + * pool entry. Clients should call LLVMOrcReleaseSymbolStringPoolEntry to + * decrement the ref-count on the pool entry once they are finished with this + * value. + */ +LLVMOrcSymbolStringPoolEntryRef +LLVMOrcLLJITMangleAndIntern(LLVMOrcLLJITRef J, const char *UnmangledName); + +/** + * Add a buffer representing an object file to the given JITDylib in the given + * LLJIT instance. This operation transfers ownership of the buffer to the + * LLJIT instance. The buffer should not be disposed of or referenced once this + * function returns. 
+ * + * Resources associated with the given object will be tracked by the given + * JITDylib's default resource tracker. + */ +LLVMErrorRef LLVMOrcLLJITAddObjectFile(LLVMOrcLLJITRef J, LLVMOrcJITDylibRef JD, + LLVMMemoryBufferRef ObjBuffer); + +/** + * Add a buffer representing an object file to the given ResourceTracker's + * JITDylib in the given LLJIT instance. This operation transfers ownership of + * the buffer to the LLJIT instance. The buffer should not be disposed of or + * referenced once this function returns. + * + * Resources associated with the given object will be tracked by ResourceTracker + * RT. + */ +LLVMErrorRef LLVMOrcLLJITAddObjectFileWithRT(LLVMOrcLLJITRef J, + LLVMOrcResourceTrackerRef RT, + LLVMMemoryBufferRef ObjBuffer); + +/** + * Add an IR module to the given JITDylib in the given LLJIT instance. This + * operation transfers ownership of the TSM argument to the LLJIT instance. + * The TSM argument should not be disposed of or referenced once this + * function returns. + * + * Resources associated with the given Module will be tracked by the given + * JITDylib's default resource tracker. + */ +LLVMErrorRef LLVMOrcLLJITAddLLVMIRModule(LLVMOrcLLJITRef J, + LLVMOrcJITDylibRef JD, + LLVMOrcThreadSafeModuleRef TSM); + +/** + * Add an IR module to the given ResourceTracker's JITDylib in the given LLJIT + * instance. This operation transfers ownership of the TSM argument to the LLJIT + * instance. The TSM argument should not be disposed of or referenced once this + * function returns. + * + * Resources associated with the given Module will be tracked by ResourceTracker + * RT. + */ +LLVMErrorRef LLVMOrcLLJITAddLLVMIRModuleWithRT(LLVMOrcLLJITRef J, + LLVMOrcResourceTrackerRef JD, + LLVMOrcThreadSafeModuleRef TSM); + +/** + * Look up the given symbol in the main JITDylib of the given LLJIT instance. + * + * This operation does not take ownership of the Name argument. 
+ */ +LLVMErrorRef LLVMOrcLLJITLookup(LLVMOrcLLJITRef J, + LLVMOrcJITTargetAddress *Result, + const char *Name); + +LLVM_C_EXTERN_C_END + +#endif /* LLVM_C_LLJIT_H */ diff --git a/contrib/llvm-project/llvm/include/llvm-c/LinkTimeOptimizer.h b/contrib/llvm-project/llvm/include/llvm-c/LinkTimeOptimizer.h deleted file mode 100644 index 9ae65b8fe5e0..000000000000 --- a/contrib/llvm-project/llvm/include/llvm-c/LinkTimeOptimizer.h +++ /dev/null @@ -1,66 +0,0 @@ -//===-- llvm/LinkTimeOptimizer.h - LTO Public C Interface -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header provides a C API to use the LLVM link time optimization -// library. This is intended to be used by linkers which are C-only in -// their implementation for performing LTO. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_C_LINKTIMEOPTIMIZER_H -#define LLVM_C_LINKTIMEOPTIMIZER_H - -#include "llvm-c/ExternC.h" - -LLVM_C_EXTERN_C_BEGIN - -/** - * @defgroup LLVMCLinkTimeOptimizer Link Time Optimization - * @ingroup LLVMC - * - * @{ - */ - - /// This provides a dummy type for pointers to the LTO object. - typedef void* llvm_lto_t; - - /// This provides a C-visible enumerator to manage status codes. - /// This should map exactly onto the C++ enumerator LTOStatus. - typedef enum llvm_lto_status { - LLVM_LTO_UNKNOWN, - LLVM_LTO_OPT_SUCCESS, - LLVM_LTO_READ_SUCCESS, - LLVM_LTO_READ_FAILURE, - LLVM_LTO_WRITE_FAILURE, - LLVM_LTO_NO_TARGET, - LLVM_LTO_NO_WORK, - LLVM_LTO_MODULE_MERGE_FAILURE, - LLVM_LTO_ASM_FAILURE, - - // Added C-specific error codes - LLVM_LTO_NULL_OBJECT - } llvm_lto_status_t; - - /// This provides C interface to initialize link time optimizer. 
This allows - /// linker to use dlopen() interface to dynamically load LinkTimeOptimizer. - /// extern "C" helps, because dlopen() interface uses name to find the symbol. - extern llvm_lto_t llvm_create_optimizer(void); - extern void llvm_destroy_optimizer(llvm_lto_t lto); - - extern llvm_lto_status_t llvm_read_object_file - (llvm_lto_t lto, const char* input_filename); - extern llvm_lto_status_t llvm_optimize_modules - (llvm_lto_t lto, const char* output_filename); - -/** - * @} - */ - - LLVM_C_EXTERN_C_END - -#endif diff --git a/contrib/llvm-project/llvm/include/llvm-c/Orc.h b/contrib/llvm-project/llvm/include/llvm-c/Orc.h index 09a058846108..9beef44c89dd 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/Orc.h +++ b/contrib/llvm-project/llvm/include/llvm-c/Orc.h @@ -38,33 +38,201 @@ LLVM_C_EXTERN_C_BEGIN */ typedef uint64_t LLVMOrcJITTargetAddress; +/** + * Represents generic linkage flags for a symbol definition. + */ +typedef enum { + LLVMJITSymbolGenericFlagsExported = 1U << 0, + LLVMJITSymbolGenericFlagsWeak = 1U << 1 +} LLVMJITSymbolGenericFlags; + +/** + * Represents target specific flags for a symbol definition. + */ +typedef uint8_t LLVMJITTargetSymbolFlags; + +/** + * Represents the linkage flags for a symbol definition. + */ +typedef struct { + uint8_t GenericFlags; + uint8_t TargetFlags; +} LLVMJITSymbolFlags; + +/** + * Represents an evaluated symbol address and flags. + */ +typedef struct { + LLVMOrcJITTargetAddress Address; + LLVMJITSymbolFlags Flags; +} LLVMJITEvaluatedSymbol; + /** * A reference to an orc::ExecutionSession instance. */ typedef struct LLVMOrcOpaqueExecutionSession *LLVMOrcExecutionSessionRef; +/** + * Error reporter function. + */ +typedef void (*LLVMOrcErrorReporterFunction)(void *Ctx, LLVMErrorRef Err); + +/** + * A reference to an orc::SymbolStringPool. + */ +typedef struct LLVMOrcOpaqueSymbolStringPool *LLVMOrcSymbolStringPoolRef; + /** * A reference to an orc::SymbolStringPool table entry. 
*/ -typedef struct LLVMOrcQuaqueSymbolStringPoolEntryPtr +typedef struct LLVMOrcOpaqueSymbolStringPoolEntry *LLVMOrcSymbolStringPoolEntryRef; +/** + * Represents a pair of a symbol name and an evaluated symbol. + */ +typedef struct { + LLVMOrcSymbolStringPoolEntryRef Name; + LLVMJITEvaluatedSymbol Sym; +} LLVMJITCSymbolMapPair; + +/** + * Represents a list of (SymbolStringPtr, JITEvaluatedSymbol) pairs that can be + * used to construct a SymbolMap. + */ +typedef LLVMJITCSymbolMapPair *LLVMOrcCSymbolMapPairs; + +/** + * Lookup kind. This can be used by definition generators when deciding whether + * to produce a definition for a requested symbol. + * + * This enum should be kept in sync with llvm::orc::LookupKind. + */ +typedef enum { + LLVMOrcLookupKindStatic, + LLVMOrcLookupKindDLSym +} LLVMOrcLookupKind; + +/** + * JITDylib lookup flags. This can be used by definition generators when + * deciding whether to produce a definition for a requested symbol. + * + * This enum should be kept in sync with llvm::orc::JITDylibLookupFlags. + */ +typedef enum { + LLVMOrcJITDylibLookupFlagsMatchExportedSymbolsOnly, + LLVMOrcJITDylibLookupFlagsMatchAllSymbols +} LLVMOrcJITDylibLookupFlags; + +/** + * Symbol lookup flags for lookup sets. This should be kept in sync with + * llvm::orc::SymbolLookupFlags. + */ +typedef enum { + LLVMOrcSymbolLookupFlagsRequiredSymbol, + LLVMOrcSymbolLookupFlagsWeaklyReferencedSymbol +} LLVMOrcSymbolLookupFlags; + +/** + * An element type for a symbol lookup set. + */ +typedef struct { + LLVMOrcSymbolStringPoolEntryRef Name; + LLVMOrcSymbolLookupFlags LookupFlags; +} LLVMOrcCLookupSetElement; + +/** + * A set of symbols to look up / generate. + * + * The list is terminated with an element containing a null pointer for the + * Name field. + * + * If a client creates an instance of this type then they are responsible for + * freeing it, and for ensuring that all strings have been retained over the + * course of its life. 
Clients receiving a copy from a callback are not + * responsible for managing lifetime or retain counts. + */ +typedef LLVMOrcCLookupSetElement *LLVMOrcCLookupSet; + +/** + * A reference to an orc::MaterializationUnit. + */ +typedef struct LLVMOrcOpaqueMaterializationUnit *LLVMOrcMaterializationUnitRef; + /** * A reference to an orc::JITDylib instance. */ typedef struct LLVMOrcOpaqueJITDylib *LLVMOrcJITDylibRef; /** - * A reference to an orc::JITDylib::DefinitionGenerator. + * A reference to an orc::ResourceTracker instance. + */ +typedef struct LLVMOrcOpaqueResourceTracker *LLVMOrcResourceTrackerRef; + +/** + * A reference to an orc::DefinitionGenerator. + */ +typedef struct LLVMOrcOpaqueDefinitionGenerator + *LLVMOrcDefinitionGeneratorRef; + +/** + * An opaque lookup state object. Instances of this type can be captured to + * suspend a lookup while a custom generator function attempts to produce a + * definition. + * + * If a client captures a lookup state object then they must eventually call + * LLVMOrcLookupStateContinueLookup to restart the lookup. This is required + * in order to release memory allocated for the lookup state, even if errors + * have occurred while the lookup was suspended (if these errors have made the + * lookup impossible to complete then it will issue its own error before + * destruction). + */ +typedef struct LLVMOrcOpaqueLookupState *LLVMOrcLookupStateRef; + +/** + * A custom generator function. This can be used to create a custom generator + * object using LLVMOrcCreateCustomCAPIDefinitionGenerator. The resulting + * object can be attached to a JITDylib, via LLVMOrcJITDylibAddGenerator, to + * receive callbacks when lookups fail to match existing definitions. + * + * GeneratorObj will contain the address of the custom generator object. + * + * Ctx will contain the context object passed to + * LLVMOrcCreateCustomCAPIDefinitionGenerator. + * + * LookupState will contain a pointer to an LLVMOrcLookupStateRef object. 
This + * can optionally be modified to make the definition generation process + * asynchronous: If the LookupStateRef value is copied, and the original + * LLVMOrcLookupStateRef set to null, the lookup will be suspended. Once the + * asynchronous definition process has been completed clients must call + * LLVMOrcLookupStateContinueLookup to continue the lookup (this should be + * done unconditionally, even if errors have occurred in the mean time, to + * free the lookup state memory and notify the query object of the failures. If + * LookupState is captured this function must return LLVMErrorSuccess. + * + * The Kind argument can be inspected to determine the lookup kind (e.g. + * as-if-during-static-link, or as-if-during-dlsym). + * + * The JD argument specifies which JITDylib the definitions should be generated + * into. + * + * The JDLookupFlags argument can be inspected to determine whether the original + * lookup included non-exported symobls. + * + * Finally, the LookupSet argument contains the set of symbols that could not + * be found in JD already (the set of generation candidates). */ -typedef struct LLVMOrcOpaqueJITDylibDefinitionGenerator - *LLVMOrcJITDylibDefinitionGeneratorRef; +typedef LLVMErrorRef (*LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction)( + LLVMOrcDefinitionGeneratorRef GeneratorObj, void *Ctx, + LLVMOrcLookupStateRef *LookupState, LLVMOrcLookupKind Kind, + LLVMOrcJITDylibRef JD, LLVMOrcJITDylibLookupFlags JDLookupFlags, + LLVMOrcCLookupSet LookupSet, size_t LookupSetSize); /** * Predicate function for SymbolStringPoolEntries. */ -typedef int (*LLVMOrcSymbolPredicate)(LLVMOrcSymbolStringPoolEntryRef Sym, - void *Ctx); +typedef int (*LLVMOrcSymbolPredicate)(void *Ctx, + LLVMOrcSymbolStringPoolEntryRef Sym); /** * A reference to an orc::ThreadSafeContext instance. @@ -83,14 +251,43 @@ typedef struct LLVMOrcOpaqueJITTargetMachineBuilder *LLVMOrcJITTargetMachineBuilderRef; /** - * A reference to an orc::LLJITBuilder instance. 
+ * A reference to an orc::ObjectLayer instance. + */ +typedef struct LLVMOrcOpaqueObjectLayer *LLVMOrcObjectLayerRef; + +/** + * Attach a custom error reporter function to the ExecutionSession. + * + * The error reporter will be called to deliver failure notices that can not be + * directly reported to a caller. For example, failure to resolve symbols in + * the JIT linker is typically reported via the error reporter (callers + * requesting definitions from the JIT will typically be delivered a + * FailureToMaterialize error instead). + */ +void LLVMOrcExecutionSessionSetErrorReporter( + LLVMOrcExecutionSessionRef ES, LLVMOrcErrorReporterFunction ReportError, + void *Ctx); + +/** + * Return a reference to the SymbolStringPool for an ExecutionSession. + * + * Ownership of the pool remains with the ExecutionSession: The caller is + * not required to free the pool. */ -typedef struct LLVMOrcOpaqueLLJITBuilder *LLVMOrcLLJITBuilderRef; +LLVMOrcSymbolStringPoolRef +LLVMOrcExecutionSessionGetSymbolStringPool(LLVMOrcExecutionSessionRef ES); /** - * A reference to an orc::LLJIT instance. + * Clear all unreferenced symbol string pool entries. + * + * This can be called at any time to release unused entries in the + * ExecutionSession's string pool. Since it locks the pool (preventing + * interning of any new strings) it is recommended that it only be called + * infrequently, ideally when the caller has reason to believe that some + * entries will have become unreferenced, e.g. after removing a module or + * closing a JITDylib. 
*/ -typedef struct LLVMOrcOpaqueLLJIT *LLVMOrcLLJITRef; +void LLVMOrcSymbolStringPoolClearDeadEntries(LLVMOrcSymbolStringPoolRef SSP); /** * Intern a string in the ExecutionSession's SymbolStringPool and return a @@ -107,27 +304,139 @@ typedef struct LLVMOrcOpaqueLLJIT *LLVMOrcLLJITRef; LLVMOrcSymbolStringPoolEntryRef LLVMOrcExecutionSessionIntern(LLVMOrcExecutionSessionRef ES, const char *Name); +/** + * Increments the ref-count for a SymbolStringPool entry. + */ +void LLVMOrcRetainSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S); + /** * Reduces the ref-count for of a SymbolStringPool entry. */ void LLVMOrcReleaseSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S); +const char *LLVMOrcSymbolStringPoolEntryStr(LLVMOrcSymbolStringPoolEntryRef S); + +/** + * Reduces the ref-count of a ResourceTracker. + */ +void LLVMOrcReleaseResourceTracker(LLVMOrcResourceTrackerRef RT); + +/** + * Transfers tracking of all resources associated with resource tracker SrcRT + * to resource tracker DstRT. + */ +void LLVMOrcResourceTrackerTransferTo(LLVMOrcResourceTrackerRef SrcRT, + LLVMOrcResourceTrackerRef DstRT); + +/** + * Remove all resources associated with the given tracker. See + * ResourceTracker::remove(). + */ +LLVMErrorRef LLVMOrcResourceTrackerRemove(LLVMOrcResourceTrackerRef RT); + /** * Dispose of a JITDylib::DefinitionGenerator. This should only be called if * ownership has not been passed to a JITDylib (e.g. because some error * prevented the client from calling LLVMOrcJITDylibAddGenerator). */ -void LLVMOrcDisposeJITDylibDefinitionGenerator( - LLVMOrcJITDylibDefinitionGeneratorRef DG); +void LLVMOrcDisposeDefinitionGenerator(LLVMOrcDefinitionGeneratorRef DG); /** - * Add a JITDylib::DefinitionGenerator to the given JITDylib. + * Dispose of a MaterializationUnit. 
+ */ +void LLVMOrcDisposeMaterializationUnit(LLVMOrcMaterializationUnitRef MU); + +/** + * Create a MaterializationUnit to define the given symbols as pointing to + * the corresponding raw addresses. + */ +LLVMOrcMaterializationUnitRef +LLVMOrcAbsoluteSymbols(LLVMOrcCSymbolMapPairs Syms, size_t NumPairs); + +/** + * Create a "bare" JITDylib. + * + * The client is responsible for ensuring that the JITDylib's name is unique, + * e.g. by calling LLVMOrcExecutionSessionGetJTIDylibByName first. + * + * This call does not install any library code or symbols into the newly + * created JITDylib. The client is responsible for all configuration. + */ +LLVMOrcJITDylibRef +LLVMOrcExecutionSessionCreateBareJITDylib(LLVMOrcExecutionSessionRef ES, + const char *Name); + +/** + * Create a JITDylib. + * + * The client is responsible for ensuring that the JITDylib's name is unique, + * e.g. by calling LLVMOrcExecutionSessionGetJTIDylibByName first. + * + * If a Platform is attached to the ExecutionSession then + * Platform::setupJITDylib will be called to install standard platform symbols + * (e.g. standard library interposes). If no Platform is installed then this + * call is equivalent to LLVMExecutionSessionRefCreateBareJITDylib and will + * always return success. + */ +LLVMErrorRef +LLVMOrcExecutionSessionCreateJITDylib(LLVMOrcExecutionSessionRef ES, + LLVMOrcJITDylibRef *Result, + const char *Name); + +/** + * Returns the JITDylib with the given name, or NULL if no such JITDylib + * exists. + */ +LLVMOrcJITDylibRef +LLVMOrcExecutionSessionGetJITDylibByName(LLVMOrcExecutionSessionRef ES, + const char *Name); + +/** + * Return a reference to a newly created resource tracker associated with JD. + * The tracker is returned with an initial ref-count of 1, and must be released + * with LLVMOrcReleaseResourceTracker when no longer needed. 
+ */ +LLVMOrcResourceTrackerRef +LLVMOrcJITDylibCreateResourceTracker(LLVMOrcJITDylibRef JD); + +/** + * Return a reference to the default resource tracker for the given JITDylib. + * This operation will increase the retain count of the tracker: Clients should + * call LLVMOrcReleaseResourceTracker when the result is no longer needed. + */ +LLVMOrcResourceTrackerRef +LLVMOrcJITDylibGetDefaultResourceTracker(LLVMOrcJITDylibRef JD); + +/** + * Add the given MaterializationUnit to the given JITDylib. + * + * If this operation succeeds then JITDylib JD will take ownership of MU. + * If the operation fails then ownership remains with the caller who should + * call LLVMOrcDisposeMaterializationUnit to destroy it. + */ +LLVMErrorRef LLVMOrcJITDylibDefine(LLVMOrcJITDylibRef JD, + LLVMOrcMaterializationUnitRef MU); + +/** + * Calls remove on all trackers associated with this JITDylib, see + * JITDylib::clear(). + */ +LLVMErrorRef LLVMOrcJITDylibClear(LLVMOrcJITDylibRef JD); + +/** + * Add a DefinitionGenerator to the given JITDylib. * * The JITDylib will take ownership of the given generator: The client is no * longer responsible for managing its memory. */ void LLVMOrcJITDylibAddGenerator(LLVMOrcJITDylibRef JD, - LLVMOrcJITDylibDefinitionGeneratorRef DG); + LLVMOrcDefinitionGeneratorRef DG); + +/** + * Create a custom generator. + */ +LLVMOrcDefinitionGeneratorRef LLVMOrcCreateCustomCAPIDefinitionGenerator( + LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction F, void *Ctx); /** * Get a DynamicLibrarySearchGenerator that will reflect process symbols into @@ -148,7 +457,7 @@ void LLVMOrcJITDylibAddGenerator(LLVMOrcJITDylibRef JD, * the global prefix if present. 
*/ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess( - LLVMOrcJITDylibDefinitionGeneratorRef *Result, char GlobalPrefx, + LLVMOrcDefinitionGeneratorRef *Result, char GlobalPrefx, LLVMOrcSymbolPredicate Filter, void *FilterCtx); /** @@ -156,7 +465,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess( * * Ownership of the underlying ThreadSafeContext data is shared: Clients * can and should dispose of their ThreadSafeContext as soon as they no longer - * need to refer to it directly. Other references (e.g. from ThreadSafeModules + * need to refer to it directly. Other references (e.g. from ThreadSafeModules) * will keep the data alive as long as it is needed. */ LLVMOrcThreadSafeContextRef LLVMOrcCreateNewThreadSafeContext(void); @@ -178,7 +487,7 @@ void LLVMOrcDisposeThreadSafeContext(LLVMOrcThreadSafeContextRef TSCtx); * after this function returns. * * Ownership of the ThreadSafeModule is unique: If it is transferred to the JIT - * (e.g. by LLVMOrcLLJITAddLLVMIRModule), in which case the client is no longer + * (e.g. by LLVMOrcLLJITAddLLVMIRModule) then the client is no longer * responsible for it. If it is not transferred to the JIT then the client * should call LLVMOrcDisposeThreadSafeModule to dispose of it. */ @@ -221,114 +530,9 @@ void LLVMOrcDisposeJITTargetMachineBuilder( LLVMOrcJITTargetMachineBuilderRef JTMB); /** - * Create an LLJITTargetMachineBuilder. - * - * The client owns the resulting LLJITBuilder and should dispose of it using - * LLVMOrcDisposeLLJITBuilder once they are done with it. - */ -LLVMOrcLLJITBuilderRef LLVMOrcCreateLLJITBuilder(void); - -/** - * Dispose of an LLVMOrcLLJITBuilderRef. This should only be called if ownership - * has not been passed to LLVMOrcCreateLLJIT (e.g. because some error prevented - * that function from being called). - */ -void LLVMOrcDisposeLLJITBuilder(LLVMOrcLLJITBuilderRef Builder); - -/** - * Set the JITTargetMachineBuilder to be used when constructing the LLJIT - * instance. 
Calling this function is optional: if it is not called then the - * LLJITBuilder will use JITTargeTMachineBuilder::detectHost to construct a - * JITTargetMachineBuilder. - */ -void LLVMOrcLLJITBuilderSetJITTargetMachineBuilder( - LLVMOrcLLJITBuilderRef Builder, LLVMOrcJITTargetMachineBuilderRef JTMB); - -/** - * Create an LLJIT instance from an LLJITBuilder. - * - * This operation takes ownership of the Builder argument: clients should not - * dispose of the builder after calling this function (even if the function - * returns an error). If a null Builder argument is provided then a - * default-constructed LLJITBuilder will be used. - * - * On success the resulting LLJIT instance is uniquely owned by the client and - * automatically manages the memory of all JIT'd code and all modules that are - * transferred to it (e.g. via LLVMOrcLLJITAddLLVMIRModule). Disposing of the - * LLJIT instance will free all memory managed by the JIT, including JIT'd code - * and not-yet compiled modules. - */ -LLVMErrorRef LLVMOrcCreateLLJIT(LLVMOrcLLJITRef *Result, - LLVMOrcLLJITBuilderRef Builder); - -/** - * Dispose of an LLJIT instance. - */ -LLVMErrorRef LLVMOrcDisposeLLJIT(LLVMOrcLLJITRef J); - -/** - * Get a reference to the ExecutionSession for this LLJIT instance. - * - * The ExecutionSession is owned by the LLJIT instance. The client is not - * responsible for managing its memory. - */ -LLVMOrcExecutionSessionRef LLVMOrcLLJITGetExecutionSession(LLVMOrcLLJITRef J); - -/** - * Return a reference to the Main JITDylib. - * - * The JITDylib is owned by the LLJIT instance. The client is not responsible - * for managing its memory. - */ -LLVMOrcJITDylibRef LLVMOrcLLJITGetMainJITDylib(LLVMOrcLLJITRef J); - -/** - * Return the target triple for this LLJIT instance. This string is owned by - * the LLJIT instance and should not be freed by the client. 
- */ -const char *LLVMOrcLLJITGetTripleString(LLVMOrcLLJITRef J); - -/** - * Returns the global prefix character according to the LLJIT's DataLayout. - */ -char LLVMOrcLLJITGetGlobalPrefix(LLVMOrcLLJITRef J); - -/** - * Mangles the given string according to the LLJIT instance's DataLayout, then - * interns the result in the SymbolStringPool and returns a reference to the - * pool entry. Clients should call LLVMOrcReleaseSymbolStringPoolEntry to - * decrement the ref-count on the pool entry once they are finished with this - * value. - */ -LLVMOrcSymbolStringPoolEntryRef -LLVMOrcLLJITMangleAndIntern(LLVMOrcLLJITRef J, const char *UnmangledName); - -/** - * Add a buffer representing an object file to the given JITDylib in the given - * LLJIT instance. This operation transfers ownership of the buffer to the - * LLJIT instance. The buffer should not be disposed of or referenced once this - * function returns. - */ -LLVMErrorRef LLVMOrcLLJITAddObjectFile(LLVMOrcLLJITRef J, LLVMOrcJITDylibRef JD, - LLVMMemoryBufferRef ObjBuffer); - -/** - * Add an IR module to the given JITDylib of the given LLJIT instance. This - * operation transfers ownership of the TSM argument to the LLJIT instance. - * The TSM argument should not be 3disposed of or referenced once this - * function returns. - */ -LLVMErrorRef LLVMOrcLLJITAddLLVMIRModule(LLVMOrcLLJITRef J, - LLVMOrcJITDylibRef JD, - LLVMOrcThreadSafeModuleRef TSM); -/** - * Look up the given symbol in the main JITDylib of the given LLJIT instance. - * - * This operation does not take ownership of the Name argument. + * Dispose of an ObjectLayer. 
*/ -LLVMErrorRef LLVMOrcLLJITLookup(LLVMOrcLLJITRef J, - LLVMOrcJITTargetAddress *Result, - const char *Name); +void LLVMOrcDisposeObjectLayer(LLVMOrcObjectLayerRef ObjLayer); LLVM_C_EXTERN_C_END diff --git a/contrib/llvm-project/llvm/include/llvm-c/OrcBindings.h b/contrib/llvm-project/llvm/include/llvm-c/OrcBindings.h deleted file mode 100644 index 11cdade7c26f..000000000000 --- a/contrib/llvm-project/llvm/include/llvm-c/OrcBindings.h +++ /dev/null @@ -1,169 +0,0 @@ -/*===----------- llvm-c/OrcBindings.h - Orc Lib C Iface ---------*- C++ -*-===*\ -|* *| -|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| -|* Exceptions. *| -|* See https://llvm.org/LICENSE.txt for license information. *| -|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| -|* *| -|*===----------------------------------------------------------------------===*| -|* *| -|* This header declares the C interface to libLLVMOrcJIT.a, which implements *| -|* JIT compilation of LLVM IR. *| -|* *| -|* Many exotic languages can interoperate with C code but have a harder time *| -|* with C++ due to name mangling. So in addition to C, this interface enables *| -|* tools written in such languages. *| -|* *| -|* Note: This interface is experimental. It is *NOT* stable, and may be *| -|* changed without warning. *| -|* *| -\*===----------------------------------------------------------------------===*/ - -#ifndef LLVM_C_ORCBINDINGS_H -#define LLVM_C_ORCBINDINGS_H - -#include "llvm-c/Error.h" -#include "llvm-c/ExternC.h" -#include "llvm-c/Object.h" -#include "llvm-c/TargetMachine.h" - -LLVM_C_EXTERN_C_BEGIN - -typedef struct LLVMOrcOpaqueJITStack *LLVMOrcJITStackRef; -typedef uint64_t LLVMOrcModuleHandle; -typedef uint64_t LLVMOrcTargetAddress; -typedef uint64_t (*LLVMOrcSymbolResolverFn)(const char *Name, void *LookupCtx); -typedef uint64_t (*LLVMOrcLazyCompileCallbackFn)(LLVMOrcJITStackRef JITStack, - void *CallbackCtx); - -/** - * Create an ORC JIT stack. 
- * - * The client owns the resulting stack, and must call OrcDisposeInstance(...) - * to destroy it and free its memory. The JIT stack will take ownership of the - * TargetMachine, which will be destroyed when the stack is destroyed. The - * client should not attempt to dispose of the Target Machine, or it will result - * in a double-free. - */ -LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM); - -/** - * Get the error message for the most recent error (if any). - * - * This message is owned by the ORC JIT Stack and will be freed when the stack - * is disposed of by LLVMOrcDisposeInstance. - */ -const char *LLVMOrcGetErrorMsg(LLVMOrcJITStackRef JITStack); - -/** - * Mangle the given symbol. - * Memory will be allocated for MangledSymbol to hold the result. The client - */ -void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledSymbol, - const char *Symbol); - -/** - * Dispose of a mangled symbol. - */ -void LLVMOrcDisposeMangledSymbol(char *MangledSymbol); - -/** - * Create a lazy compile callback. - */ -LLVMErrorRef LLVMOrcCreateLazyCompileCallback( - LLVMOrcJITStackRef JITStack, LLVMOrcTargetAddress *RetAddr, - LLVMOrcLazyCompileCallbackFn Callback, void *CallbackCtx); - -/** - * Create a named indirect call stub. - */ -LLVMErrorRef LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack, - const char *StubName, - LLVMOrcTargetAddress InitAddr); - -/** - * Set the pointer for the given indirect stub. - */ -LLVMErrorRef LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack, - const char *StubName, - LLVMOrcTargetAddress NewAddr); - -/** - * Add module to be eagerly compiled. - */ -LLVMErrorRef LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack, - LLVMOrcModuleHandle *RetHandle, - LLVMModuleRef Mod, - LLVMOrcSymbolResolverFn SymbolResolver, - void *SymbolResolverCtx); - -/** - * Add module to be lazily compiled one function at a time. 
- */ -LLVMErrorRef LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack, - LLVMOrcModuleHandle *RetHandle, - LLVMModuleRef Mod, - LLVMOrcSymbolResolverFn SymbolResolver, - void *SymbolResolverCtx); - -/** - * Add an object file. - * - * This method takes ownership of the given memory buffer and attempts to add - * it to the JIT as an object file. - * Clients should *not* dispose of the 'Obj' argument: the JIT will manage it - * from this call onwards. - */ -LLVMErrorRef LLVMOrcAddObjectFile(LLVMOrcJITStackRef JITStack, - LLVMOrcModuleHandle *RetHandle, - LLVMMemoryBufferRef Obj, - LLVMOrcSymbolResolverFn SymbolResolver, - void *SymbolResolverCtx); - -/** - * Remove a module set from the JIT. - * - * This works for all modules that can be added via OrcAdd*, including object - * files. - */ -LLVMErrorRef LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack, - LLVMOrcModuleHandle H); - -/** - * Get symbol address from JIT instance. - */ -LLVMErrorRef LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack, - LLVMOrcTargetAddress *RetAddr, - const char *SymbolName); - -/** - * Get symbol address from JIT instance, searching only the specified - * handle. - */ -LLVMErrorRef LLVMOrcGetSymbolAddressIn(LLVMOrcJITStackRef JITStack, - LLVMOrcTargetAddress *RetAddr, - LLVMOrcModuleHandle H, - const char *SymbolName); - -/** - * Dispose of an ORC JIT stack. - */ -LLVMErrorRef LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack); - -/** - * Register a JIT Event Listener. - * - * A NULL listener is ignored. - */ -void LLVMOrcRegisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L); - -/** - * Unegister a JIT Event Listener. - * - * A NULL listener is ignored. 
- */ -void LLVMOrcUnregisterJITEventListener(LLVMOrcJITStackRef JITStack, LLVMJITEventListenerRef L); - -LLVM_C_EXTERN_C_END - -#endif /* LLVM_C_ORCBINDINGS_H */ diff --git a/contrib/llvm-project/llvm/include/llvm-c/OrcEE.h b/contrib/llvm-project/llvm/include/llvm-c/OrcEE.h new file mode 100644 index 000000000000..2435e7421a42 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm-c/OrcEE.h @@ -0,0 +1,55 @@ +/*===-- llvm-c/OrcEE.h - OrcV2 C bindings ExecutionEngine utils -*- C++ -*-===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to ExecutionEngine based utils, e.g. *| +|* RTDyldObjectLinkingLayer (based on RuntimeDyld) in Orc. *| +|* *| +|* Many exotic languages can interoperate with C code but have a harder time *| +|* with C++ due to name mangling. So in addition to C, this interface enables *| +|* tools written in such languages. *| +|* *| +|* Note: This interface is experimental. It is *NOT* stable, and may be *| +|* changed without warning. Only C API usage documentation is *| +|* provided. See the C++ documentation for all higher level ORC API *| +|* details. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_ORCEE_H +#define LLVM_C_ORCEE_H + +#include "llvm-c/Error.h" +#include "llvm-c/ExecutionEngine.h" +#include "llvm-c/Orc.h" +#include "llvm-c/TargetMachine.h" +#include "llvm-c/Types.h" + +LLVM_C_EXTERN_C_BEGIN + +/** + * Create a RTDyldObjectLinkingLayer instance using the standard + * SectionMemoryManager for memory management. 
+ */ +LLVMOrcObjectLayerRef +LLVMOrcCreateRTDyldObjectLinkingLayerWithSectionMemoryManager( + LLVMOrcExecutionSessionRef ES); + +/** + * Add the given listener to the given RTDyldObjectLinkingLayer. + * + * Note: Layer must be an RTDyldObjectLinkingLayer instance or + * behavior is undefined. + */ +void LLVMOrcRTDyldObjectLinkingLayerRegisterJITEventListener( + LLVMOrcObjectLayerRef RTDyldObjLinkingLayer, + LLVMJITEventListenerRef Listener); + +LLVM_C_EXTERN_C_END + +#endif /* LLVM_C_ORCEE_H */ diff --git a/contrib/llvm-project/llvm/include/llvm-c/Transforms/IPO.h b/contrib/llvm-project/llvm/include/llvm-c/Transforms/IPO.h index cde3d2460920..3f2cadf32366 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/Transforms/IPO.h +++ b/contrib/llvm-project/llvm/include/llvm-c/Transforms/IPO.h @@ -57,9 +57,6 @@ void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM); /** See llvm::createGlobalOptimizerPass function. */ void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM); -/** See llvm::createIPConstantPropagationPass function. */ -void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM); - /** See llvm::createPruneEHPass function. */ void LLVMAddPruneEHPass(LLVMPassManagerRef PM); diff --git a/contrib/llvm-project/llvm/include/llvm-c/Transforms/Scalar.h b/contrib/llvm-project/llvm/include/llvm-c/Transforms/Scalar.h index 93d79a205195..ba142508bbe4 100644 --- a/contrib/llvm-project/llvm/include/llvm-c/Transforms/Scalar.h +++ b/contrib/llvm-project/llvm/include/llvm-c/Transforms/Scalar.h @@ -67,6 +67,9 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM); /** See llvm::createInstructionCombiningPass function. */ void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM); +/** See llvm::createInstSimplifyLegacyPass function. */ +void LLVMAddInstructionSimplifyPass(LLVMPassManagerRef PM); + /** See llvm::createJumpThreadingPass function. 
*/ void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM); @@ -125,9 +128,6 @@ void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM); /** See llvm::createTailCallEliminationPass function. */ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM); -/** See llvm::createConstantPropagationPass function. */ -void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM); - /** See llvm::demotePromoteMemoryToRegisterPass function. */ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM); diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/APFixedPoint.h b/contrib/llvm-project/llvm/include/llvm/ADT/APFixedPoint.h new file mode 100644 index 000000000000..d6349e6b2a88 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/ADT/APFixedPoint.h @@ -0,0 +1,237 @@ +//===- APFixedPoint.h - Fixed point constant handling -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Defines the fixed point number interface. +/// This is a class for abstracting various operations performed on fixed point +/// types. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_APFIXEDPOINT_H +#define LLVM_ADT_APFIXEDPOINT_H + +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class APFloat; +struct fltSemantics; + +/// The fixed point semantics work similarly to fltSemantics. The width +/// specifies the whole bit width of the underlying scaled integer (with padding +/// if any). The scale represents the number of fractional bits in this type. +/// When HasUnsignedPadding is true and this type is unsigned, the first bit +/// in the value this represents is treated as padding. 
+class FixedPointSemantics { +public: + FixedPointSemantics(unsigned Width, unsigned Scale, bool IsSigned, + bool IsSaturated, bool HasUnsignedPadding) + : Width(Width), Scale(Scale), IsSigned(IsSigned), + IsSaturated(IsSaturated), HasUnsignedPadding(HasUnsignedPadding) { + assert(Width >= Scale && "Not enough room for the scale"); + assert(!(IsSigned && HasUnsignedPadding) && + "Cannot have unsigned padding on a signed type."); + } + + unsigned getWidth() const { return Width; } + unsigned getScale() const { return Scale; } + bool isSigned() const { return IsSigned; } + bool isSaturated() const { return IsSaturated; } + bool hasUnsignedPadding() const { return HasUnsignedPadding; } + + void setSaturated(bool Saturated) { IsSaturated = Saturated; } + + /// Return the number of integral bits represented by these semantics. These + /// are separate from the fractional bits and do not include the sign or + /// padding bit. + unsigned getIntegralBits() const { + if (IsSigned || (!IsSigned && HasUnsignedPadding)) + return Width - Scale - 1; + else + return Width - Scale; + } + + /// Return the FixedPointSemantics that allows for calculating the full + /// precision semantic that can precisely represent the precision and ranges + /// of both input values. This does not compute the resulting semantics for a + /// given binary operation. + FixedPointSemantics + getCommonSemantics(const FixedPointSemantics &Other) const; + + /// Returns true if this fixed-point semantic with its value bits interpreted + /// as an integer can fit in the given floating point semantic without + /// overflowing to infinity. + /// For example, a signed 8-bit fixed-point semantic has a maximum and + /// minimum integer representation of 127 and -128, respectively. If both of + /// these values can be represented (possibly inexactly) in the floating + /// point semantic without overflowing, this returns true. 
+ bool fitsInFloatSemantics(const fltSemantics &FloatSema) const; + + /// Return the FixedPointSemantics for an integer type. + static FixedPointSemantics GetIntegerSemantics(unsigned Width, + bool IsSigned) { + return FixedPointSemantics(Width, /*Scale=*/0, IsSigned, + /*IsSaturated=*/false, + /*HasUnsignedPadding=*/false); + } + +private: + unsigned Width : 16; + unsigned Scale : 13; + unsigned IsSigned : 1; + unsigned IsSaturated : 1; + unsigned HasUnsignedPadding : 1; +}; + +/// The APFixedPoint class works similarly to APInt/APSInt in that it is a +/// functional replacement for a scaled integer. It is meant to replicate the +/// fixed point types proposed in ISO/IEC JTC1 SC22 WG14 N1169. The class carries +/// info about the fixed point type's width, sign, scale, and saturation, and +/// provides different operations that would normally be performed on fixed point +/// types. +class APFixedPoint { +public: + APFixedPoint(const APInt &Val, const FixedPointSemantics &Sema) + : Val(Val, !Sema.isSigned()), Sema(Sema) { + assert(Val.getBitWidth() == Sema.getWidth() && + "The value should have a bit width that matches the Sema width"); + } + + APFixedPoint(uint64_t Val, const FixedPointSemantics &Sema) + : APFixedPoint(APInt(Sema.getWidth(), Val, Sema.isSigned()), Sema) {} + + // Zero initialization. + APFixedPoint(const FixedPointSemantics &Sema) : APFixedPoint(0, Sema) {} + + APSInt getValue() const { return APSInt(Val, !Sema.isSigned()); } + inline unsigned getWidth() const { return Sema.getWidth(); } + inline unsigned getScale() const { return Sema.getScale(); } + inline bool isSaturated() const { return Sema.isSaturated(); } + inline bool isSigned() const { return Sema.isSigned(); } + inline bool hasPadding() const { return Sema.hasUnsignedPadding(); } + FixedPointSemantics getSemantics() const { return Sema; } + + bool getBoolValue() const { return Val.getBoolValue(); } + + // Convert this number to match the semantics provided. 
If the overflow + // parameter is provided, set this value to true or false to indicate if this + // operation results in an overflow. + APFixedPoint convert(const FixedPointSemantics &DstSema, + bool *Overflow = nullptr) const; + + // Perform binary operations on a fixed point type. The resulting fixed point + // value will be in the common, full precision semantics that can represent + // the precision and ranges of both input values. See convert() for an + // explanation of the Overflow parameter. + APFixedPoint add(const APFixedPoint &Other, bool *Overflow = nullptr) const; + APFixedPoint sub(const APFixedPoint &Other, bool *Overflow = nullptr) const; + APFixedPoint mul(const APFixedPoint &Other, bool *Overflow = nullptr) const; + APFixedPoint div(const APFixedPoint &Other, bool *Overflow = nullptr) const; + + // Perform shift operations on a fixed point type. Unlike the other binary + // operations, the resulting fixed point value will be in the original + // semantic. + APFixedPoint shl(unsigned Amt, bool *Overflow = nullptr) const; + APFixedPoint shr(unsigned Amt, bool *Overflow = nullptr) const { + // Right shift cannot overflow. + if (Overflow) + *Overflow = false; + return APFixedPoint(Val >> Amt, Sema); + } + + /// Perform a unary negation (-X) on this fixed point type, taking into + /// account saturation if applicable. + APFixedPoint negate(bool *Overflow = nullptr) const; + + /// Return the integral part of this fixed point number, rounded towards + /// zero. (-2.5k -> -2) + APSInt getIntPart() const { + if (Val < 0 && Val != -Val) // Cover the case when we have the min val + return -(-Val >> getScale()); + else + return Val >> getScale(); + } + + /// Return the integral part of this fixed point number, rounded towards + /// zero. The value is stored into an APSInt with the provided width and sign. 
+ /// If the overflow parameter is provided, and the integral value is not able + /// to be fully stored in the provided width and sign, the overflow parameter + /// is set to true. + APSInt convertToInt(unsigned DstWidth, bool DstSign, + bool *Overflow = nullptr) const; + + /// Convert this fixed point number to a floating point value with the + /// provided semantics. + APFloat convertToFloat(const fltSemantics &FloatSema) const; + + void toString(SmallVectorImpl &Str) const; + std::string toString() const { + SmallString<40> S; + toString(S); + return std::string(S.str()); + } + + // If LHS > RHS, return 1. If LHS == RHS, return 0. If LHS < RHS, return -1. + int compare(const APFixedPoint &Other) const; + bool operator==(const APFixedPoint &Other) const { + return compare(Other) == 0; + } + bool operator!=(const APFixedPoint &Other) const { + return compare(Other) != 0; + } + bool operator>(const APFixedPoint &Other) const { return compare(Other) > 0; } + bool operator<(const APFixedPoint &Other) const { return compare(Other) < 0; } + bool operator>=(const APFixedPoint &Other) const { + return compare(Other) >= 0; + } + bool operator<=(const APFixedPoint &Other) const { + return compare(Other) <= 0; + } + + static APFixedPoint getMax(const FixedPointSemantics &Sema); + static APFixedPoint getMin(const FixedPointSemantics &Sema); + + /// Given a floating point semantic, return the next floating point semantic + /// with a larger exponent and larger or equal mantissa. + static const fltSemantics *promoteFloatSemantics(const fltSemantics *S); + + /// Create an APFixedPoint with a value equal to that of the provided integer, + /// and in the same semantics as the provided target semantics. If the value + /// is not able to fit in the specified fixed point semantics, and the + /// overflow parameter is provided, it is set to true. 
+ static APFixedPoint getFromIntValue(const APSInt &Value, + const FixedPointSemantics &DstFXSema, + bool *Overflow = nullptr); + + /// Create an APFixedPoint with a value equal to that of the provided + /// floating point value, in the provided target semantics. If the value is + /// not able to fit in the specified fixed point semantics and the overflow + /// parameter is specified, it is set to true. + /// For NaN, the Overflow flag is always set. For +inf and -inf, if the + /// semantic is saturating, the value saturates. Otherwise, the Overflow flag + /// is set. + static APFixedPoint getFromFloatValue(const APFloat &Value, + const FixedPointSemantics &DstFXSema, + bool *Overflow = nullptr); + +private: + APSInt Val; + FixedPointSemantics Sema; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const APFixedPoint &FX) { + OS << FX.toString(); + return OS; +} + +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/APFloat.h b/contrib/llvm-project/llvm/include/llvm/ADT/APFloat.h index 876e52c150a0..1f9ac22621a6 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/APFloat.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/APFloat.h @@ -249,7 +249,7 @@ public: /// \name Constructors /// @{ - IEEEFloat(const fltSemantics &); // Default construct to 0.0 + IEEEFloat(const fltSemantics &); // Default construct to +0.0 IEEEFloat(const fltSemantics &, integerPart); IEEEFloat(const fltSemantics &, uninitializedTag); IEEEFloat(const fltSemantics &, const APInt &); @@ -539,6 +539,9 @@ private: roundingMode) const; opStatus roundSignificandWithExponent(const integerPart *, unsigned int, int, roundingMode); + ExponentType exponentNaN() const; + ExponentType exponentInf() const; + ExponentType exponentZero() const; /// @} diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/APInt.h b/contrib/llvm-project/llvm/include/llvm/ADT/APInt.h index f7df648d27ed..b97ea2cd9aee 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/APInt.h +++ 
b/contrib/llvm-project/llvm/include/llvm/ADT/APInt.h @@ -31,6 +31,7 @@ class raw_ostream; template class SmallVectorImpl; template class ArrayRef; template class Optional; +template struct DenseMapInfo; class APInt; @@ -96,7 +97,7 @@ private: unsigned BitWidth; ///< The number of bits in this APInt. - friend struct DenseMapAPIntKeyInfo; + friend struct DenseMapInfo; friend class APSInt; @@ -764,8 +765,8 @@ public: /// Move assignment operator. APInt &operator=(APInt &&that) { -#ifdef _MSC_VER - // The MSVC std::shuffle implementation still does self-assignment. +#ifdef EXPENSIVE_CHECKS + // Some std::shuffle implementations still do self-assignment. if (this == &that) return *this; #endif @@ -793,11 +794,10 @@ public: APInt &operator=(uint64_t RHS) { if (isSingleWord()) { U.VAL = RHS; - clearUnusedBits(); - } else { - U.pVal[0] = RHS; - memset(U.pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE); + return clearUnusedBits(); } + U.pVal[0] = RHS; + memset(U.pVal + 1, 0, (getNumWords() - 1) * APINT_WORD_SIZE); return *this; } @@ -854,10 +854,9 @@ public: APInt &operator|=(uint64_t RHS) { if (isSingleWord()) { U.VAL |= RHS; - clearUnusedBits(); - } else { - U.pVal[0] |= RHS; + return clearUnusedBits(); } + U.pVal[0] |= RHS; return *this; } @@ -884,10 +883,9 @@ public: APInt &operator^=(uint64_t RHS) { if (isSingleWord()) { U.VAL ^= RHS; - clearUnusedBits(); - } else { - U.pVal[0] ^= RHS; + return clearUnusedBits(); } + U.pVal[0] ^= RHS; return *this; } @@ -1405,6 +1403,12 @@ public: /// extended, truncated, or left alone to make it that width. APInt zextOrTrunc(unsigned width) const; + /// Truncate to width + /// + /// Make this APInt have the bit width given by \p width. The value is + /// truncated or left alone to make it that width. + APInt truncOrSelf(unsigned width) const; + /// Sign extend or truncate to width /// /// Make this APInt have the bit width given by \p width. 
The value is sign @@ -1449,6 +1453,14 @@ public: setBit(BitWidth - 1); } + /// Set a given bit to a given value. + void setBitVal(unsigned BitPosition, bool BitValue) { + if (BitValue) + setBit(BitPosition); + else + clearBit(BitPosition); + } + /// Set the bits from loBit (inclusive) to hiBit (exclusive) to 1. /// This function handles "wrap" case when \p loBit >= \p hiBit, and calls /// setBits when \p loBit < \p hiBit. @@ -1609,11 +1621,7 @@ public: /// returns the smallest bit width that will retain the negative value. For /// example, -1 can be written as 0b1 or 0xFFFFFFFFFF. 0b1 is shorter and so /// for -1, this function will always return 1. - unsigned getMinSignedBits() const { - if (isNegative()) - return BitWidth - countLeadingOnes() + 1; - return getActiveBits() + 1; - } + unsigned getMinSignedBits() const { return BitWidth - getNumSignBits() + 1; } /// Get zero extended value /// diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/APSInt.h b/contrib/llvm-project/llvm/include/llvm/ADT/APSInt.h index 0f991826c457..82e9ba81141f 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/APSInt.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/APSInt.h @@ -18,6 +18,7 @@ namespace llvm { +/// An arbitrary precision integer that knows its signedness. class LLVM_NODISCARD APSInt : public APInt { bool IsUnsigned; @@ -25,8 +26,7 @@ public: /// Default constructor that creates an uninitialized APInt. explicit APSInt() : IsUnsigned(false) {} - /// APSInt ctor - Create an APSInt with the specified width, default to - /// unsigned. + /// Create an APSInt with the specified width, default to unsigned. explicit APSInt(uint32_t BitWidth, bool isUnsigned = true) : APInt(BitWidth, 0), IsUnsigned(isUnsigned) {} @@ -78,11 +78,11 @@ public: void setIsUnsigned(bool Val) { IsUnsigned = Val; } void setIsSigned(bool Val) { IsUnsigned = !Val; } - /// toString - Append this APSInt to the specified SmallString. + /// Append this APSInt to the specified SmallString. 
void toString(SmallVectorImpl &Str, unsigned Radix = 10) const { APInt::toString(Str, Radix, isSigned()); } - /// toString - Converts an APInt to a std::string. This is an inefficient + /// Converts an APInt to a std::string. This is an inefficient /// method; you should prefer passing in a SmallString instead. std::string toString(unsigned Radix) const { return APInt::toString(Radix, isSigned()); @@ -282,15 +282,15 @@ public: return APSInt(~static_cast(*this), IsUnsigned); } - /// getMaxValue - Return the APSInt representing the maximum integer value - /// with the given bit width and signedness. + /// Return the APSInt representing the maximum integer value with the given + /// bit width and signedness. static APSInt getMaxValue(uint32_t numBits, bool Unsigned) { return APSInt(Unsigned ? APInt::getMaxValue(numBits) : APInt::getSignedMaxValue(numBits), Unsigned); } - /// getMinValue - Return the APSInt representing the minimum integer value - /// with the given bit width and signedness. + /// Return the APSInt representing the minimum integer value with the given + /// bit width and signedness. static APSInt getMinValue(uint32_t numBits, bool Unsigned) { return APSInt(Unsigned ? APInt::getMinValue(numBits) : APInt::getSignedMinValue(numBits), Unsigned); @@ -331,8 +331,8 @@ public: static APSInt get(int64_t X) { return APSInt(APInt(64, X), false); } static APSInt getUnsigned(uint64_t X) { return APSInt(APInt(64, X), true); } - /// Profile - Used to insert APSInt objects, or objects that contain APSInt - /// objects, into FoldingSets. + /// Used to insert APSInt objects, or objects that contain APSInt objects, + /// into FoldingSets. 
void Profile(FoldingSetNodeID& ID) const; }; diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/AllocatorList.h b/contrib/llvm-project/llvm/include/llvm/ADT/AllocatorList.h index 447d7a7538db..404a657f27de 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/AllocatorList.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/AllocatorList.h @@ -118,13 +118,6 @@ private: reference operator*() const { return base_type::wrapped()->V; } pointer operator->() const { return &operator*(); } - - friend bool operator==(const IteratorImpl &L, const IteratorImpl &R) { - return L.wrapped() == R.wrapped(); - } - friend bool operator!=(const IteratorImpl &L, const IteratorImpl &R) { - return !(L == R); - } }; public: diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Any.h b/contrib/llvm-project/llvm/include/llvm/ADT/Any.h index 0aded628cda4..1e3abca70679 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Any.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Any.h @@ -23,7 +23,12 @@ namespace llvm { -class Any { +class LLVM_EXTERNAL_VISIBILITY Any { + + // The `Typeid::Id` static data member below is a globally unique + // identifier for the type `T`. It is explicitly marked with default + // visibility so that when `-fvisibility=hidden` is used, the loader still + // merges duplicate definitions across DSO boundaries. template struct TypeId { static const char Id; }; struct StorageBase { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/BitVector.h b/contrib/llvm-project/llvm/include/llvm/ADT/BitVector.h index a8d0f07af94a..2a857786f454 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/BitVector.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/BitVector.h @@ -203,9 +203,10 @@ public: return !any(); } - /// find_first_in - Returns the index of the first set bit in the range - /// [Begin, End). Returns -1 if all bits in the range are unset. 
- int find_first_in(unsigned Begin, unsigned End) const { + /// find_first_in - Returns the index of the first set / unset bit, + /// depending on \p Set, in the range [Begin, End). + /// Returns -1 if all bits in the range are unset / set. + int find_first_in(unsigned Begin, unsigned End, bool Set = true) const { assert(Begin <= End && End <= Size); if (Begin == End) return -1; @@ -214,8 +215,14 @@ public: unsigned LastWord = (End - 1) / BITWORD_SIZE; // Check subsequent words. + // The code below is based on search for the first _set_ bit. If + // we're searching for the first _unset_, we just take the + // complement of each word before we use it and apply + // the same method. for (unsigned i = FirstWord; i <= LastWord; ++i) { BitWord Copy = Bits[i]; + if (!Set) + Copy = ~Copy; if (i == FirstWord) { unsigned FirstBit = Begin % BITWORD_SIZE; @@ -266,32 +273,7 @@ public: /// find_first_unset_in - Returns the index of the first unset bit in the /// range [Begin, End). Returns -1 if all bits in the range are set. int find_first_unset_in(unsigned Begin, unsigned End) const { - assert(Begin <= End && End <= Size); - if (Begin == End) - return -1; - - unsigned FirstWord = Begin / BITWORD_SIZE; - unsigned LastWord = (End - 1) / BITWORD_SIZE; - - // Check subsequent words. - for (unsigned i = FirstWord; i <= LastWord; ++i) { - BitWord Copy = Bits[i]; - - if (i == FirstWord) { - unsigned FirstBit = Begin % BITWORD_SIZE; - Copy |= maskTrailingOnes(FirstBit); - } - - if (i == LastWord) { - unsigned LastBit = (End - 1) % BITWORD_SIZE; - Copy |= maskTrailingZeros(LastBit + 1); - } - if (Copy != ~BitWord(0)) { - unsigned Result = i * BITWORD_SIZE + countTrailingOnes(Copy); - return Result < size() ? 
Result : -1; - } - } - return -1; + return find_first_in(Begin, End, /* Set = */ false); } /// find_last_unset_in - Returns the index of the last unset bit in the diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/DenseMap.h b/contrib/llvm-project/llvm/include/llvm/ADT/DenseMap.h index 34d397cc9793..ce0b05db840c 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/DenseMap.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/DenseMap.h @@ -426,8 +426,8 @@ protected: setNumEntries(other.getNumEntries()); setNumTombstones(other.getNumTombstones()); - if (is_trivially_copyable::value && - is_trivially_copyable::value) + if (std::is_trivially_copyable::value && + std::is_trivially_copyable::value) memcpy(reinterpret_cast(getBuckets()), other.getBuckets(), getNumBuckets() * sizeof(BucketT)); else @@ -954,7 +954,7 @@ public: std::swap(*LHSB, *RHSB); continue; } - // Swap separately and handle any assymetry. + // Swap separately and handle any asymmetry. std::swap(LHSB->getFirst(), RHSB->getFirst()); if (hasLHSValue) { ::new (&RHSB->getSecond()) ValueT(std::move(LHSB->getSecond())); @@ -1042,7 +1042,7 @@ public: if (Small) { // First move the inline buckets into a temporary storage. AlignedCharArrayUnion TmpStorage; - BucketT *TmpBegin = reinterpret_cast(TmpStorage.buffer); + BucketT *TmpBegin = reinterpret_cast(&TmpStorage); BucketT *TmpEnd = TmpBegin; // Loop over the buckets, moving non-empty, non-tombstones into the @@ -1132,8 +1132,8 @@ private: assert(Small); // Note that this cast does not violate aliasing rules as we assert that // the memory's dynamic type is the small, inline bucket buffer, and the - // 'storage.buffer' static type is 'char *'. - return reinterpret_cast(storage.buffer); + // 'storage' is a POD containing a char buffer. + return reinterpret_cast(&storage); } BucketT *getInlineBuckets() { @@ -1144,7 +1144,7 @@ private: const LargeRep *getLargeRep() const { assert(!Small); // Note, same rule about aliasing as with getInlineBuckets. 
- return reinterpret_cast(storage.buffer); + return reinterpret_cast(&storage); } LargeRep *getLargeRep() { @@ -1190,8 +1190,6 @@ class DenseMapIterator : DebugEpochBase::HandleBase { friend class DenseMapIterator; friend class DenseMapIterator; - using ConstIterator = DenseMapIterator; - public: using difference_type = ptrdiff_t; using value_type = @@ -1244,19 +1242,18 @@ public: return Ptr; } - bool operator==(const ConstIterator &RHS) const { - assert((!Ptr || isHandleInSync()) && "handle not in sync!"); + friend bool operator==(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { + assert((!LHS.Ptr || LHS.isHandleInSync()) && "handle not in sync!"); assert((!RHS.Ptr || RHS.isHandleInSync()) && "handle not in sync!"); - assert(getEpochAddress() == RHS.getEpochAddress() && + assert(LHS.getEpochAddress() == RHS.getEpochAddress() && "comparing incomparable iterators!"); - return Ptr == RHS.Ptr; + return LHS.Ptr == RHS.Ptr; } - bool operator!=(const ConstIterator &RHS) const { - assert((!Ptr || isHandleInSync()) && "handle not in sync!"); - assert((!RHS.Ptr || RHS.isHandleInSync()) && "handle not in sync!"); - assert(getEpochAddress() == RHS.getEpochAddress() && - "comparing incomparable iterators!"); - return Ptr != RHS.Ptr; + + friend bool operator!=(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { + return !(LHS == RHS); } inline DenseMapIterator& operator++() { // Preincrement diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/DenseMapInfo.h b/contrib/llvm-project/llvm/include/llvm/ADT/DenseMapInfo.h index e465331ac6f7..8271b9334b86 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/DenseMapInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/DenseMapInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_ADT_DENSEMAPINFO_H #define LLVM_ADT_DENSEMAPINFO_H +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" @@ -347,6 +349,49 @@ template <> struct 
DenseMapInfo { static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; } }; +/// Provide DenseMapInfo for APInt. +template <> struct DenseMapInfo { + static inline APInt getEmptyKey() { + APInt V(nullptr, 0); + V.U.VAL = 0; + return V; + } + + static inline APInt getTombstoneKey() { + APInt V(nullptr, 0); + V.U.VAL = 1; + return V; + } + + static unsigned getHashValue(const APInt &Key) { + return static_cast(hash_value(Key)); + } + + static bool isEqual(const APInt &LHS, const APInt &RHS) { + return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS; + } +}; + +/// Provide DenseMapInfo for APSInt, using the DenseMapInfo for APInt. +template <> struct DenseMapInfo { + static inline APSInt getEmptyKey() { + return APSInt(DenseMapInfo::getEmptyKey()); + } + + static inline APSInt getTombstoneKey() { + return APSInt(DenseMapInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const APSInt &Key) { + return static_cast(hash_value(Key)); + } + + static bool isEqual(const APSInt &LHS, const APSInt &RHS) { + return LHS.getBitWidth() == RHS.getBitWidth() && + LHS.isUnsigned() == RHS.isUnsigned() && LHS == RHS; + } +}; + } // end namespace llvm #endif // LLVM_ADT_DENSEMAPINFO_H diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/DenseSet.h b/contrib/llvm-project/llvm/include/llvm/ADT/DenseSet.h index 07edc3d8e4ec..edce7c43773c 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/DenseSet.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/DenseSet.h @@ -130,8 +130,12 @@ public: Iterator& operator++() { ++I; return *this; } Iterator operator++(int) { auto T = *this; ++I; return T; } - bool operator==(const ConstIterator& X) const { return I == X.I; } - bool operator!=(const ConstIterator& X) const { return I != X.I; } + friend bool operator==(const Iterator &X, const Iterator &Y) { + return X.I == Y.I; + } + friend bool operator!=(const Iterator &X, const Iterator &Y) { + return X.I != Y.I; + } }; class ConstIterator { @@ -155,8 +159,12 @@ 
public: ConstIterator& operator++() { ++I; return *this; } ConstIterator operator++(int) { auto T = *this; ++I; return T; } - bool operator==(const ConstIterator& X) const { return I == X.I; } - bool operator!=(const ConstIterator& X) const { return I != X.I; } + friend bool operator==(const ConstIterator &X, const ConstIterator &Y) { + return X.I == Y.I; + } + friend bool operator!=(const ConstIterator &X, const ConstIterator &Y) { + return X.I != Y.I; + } }; using iterator = Iterator; @@ -173,6 +181,11 @@ public: return ConstIterator(TheMap.find(V)); } + /// Check if the set contains the given element. + bool contains(const_arg_type_t V) const { + return TheMap.find(V) != TheMap.end(); + } + /// Alternative version of find() which allows a different, and possibly less /// expensive, key type. /// The DenseMapInfo is responsible for supplying methods diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/DepthFirstIterator.h b/contrib/llvm-project/llvm/include/llvm/ADT/DepthFirstIterator.h index 11967f5eefcc..5bfea28332b2 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/DepthFirstIterator.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/DepthFirstIterator.h @@ -198,7 +198,7 @@ public: // nodes that a depth first iteration did not find: ie unreachable nodes. // bool nodeVisited(NodeRef Node) const { - return this->Visited.count(Node) != 0; + return this->Visited.contains(Node); } /// getPathLength - Return the length of the path from the entry node to the diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/DirectedGraph.h b/contrib/llvm-project/llvm/include/llvm/ADT/DirectedGraph.h index cfe98e178a91..e8bb9e6b2292 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/DirectedGraph.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/DirectedGraph.h @@ -38,8 +38,10 @@ public: /// Static polymorphism: delegate implementation (via isEqualTo) to the /// derived class. 
- bool operator==(const EdgeType &E) const { return getDerived().isEqualTo(E); } - bool operator!=(const EdgeType &E) const { return !operator==(E); } + bool operator==(const DGEdge &E) const { + return getDerived().isEqualTo(E.getDerived()); + } + bool operator!=(const DGEdge &E) const { return !operator==(E); } /// Retrieve the target node this edge connects to. const NodeType &getTargetNode() const { return TargetNode; } @@ -91,8 +93,12 @@ public: /// Static polymorphism: delegate implementation (via isEqualTo) to the /// derived class. - bool operator==(const NodeType &N) const { return getDerived().isEqualTo(N); } - bool operator!=(const NodeType &N) const { return !operator==(N); } + friend bool operator==(const NodeType &M, const NodeType &N) { + return M.isEqualTo(N); + } + friend bool operator!=(const NodeType &M, const NodeType &N) { + return !(M == N); + } const_iterator begin() const { return Edges.begin(); } const_iterator end() const { return Edges.end(); } @@ -223,7 +229,7 @@ public: if (*Node == N) continue; Node->findEdgesTo(N, TempList); - EL.insert(EL.end(), TempList.begin(), TempList.end()); + llvm::append_range(EL, TempList); TempList.clear(); } return !EL.empty(); diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/FloatingPointMode.h b/contrib/llvm-project/llvm/include/llvm/ADT/FloatingPointMode.h index 3ba8ae1b2855..698830937870 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/FloatingPointMode.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/FloatingPointMode.h @@ -44,6 +44,24 @@ enum class RoundingMode : int8_t { Invalid = -1 ///< Denotes invalid value. }; +/// Returns text representation of the given rounding mode. 
+inline StringRef spell(RoundingMode RM) { + switch (RM) { + case RoundingMode::TowardZero: return "towardzero"; + case RoundingMode::NearestTiesToEven: return "tonearest"; + case RoundingMode::TowardPositive: return "upward"; + case RoundingMode::TowardNegative: return "downward"; + case RoundingMode::NearestTiesToAway: return "tonearestaway"; + case RoundingMode::Dynamic: return "dynamic"; + default: return "invalid"; + } +} + +inline raw_ostream &operator << (raw_ostream &OS, RoundingMode RM) { + OS << spell(RM); + return OS; +} + /// Represent subnormal handling kind for floating point instruction inputs and /// outputs. struct DenormalMode { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/FunctionExtras.h b/contrib/llvm-project/llvm/include/llvm/ADT/FunctionExtras.h index 4c75e4d2547b..7f8fb103f148 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/FunctionExtras.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/FunctionExtras.h @@ -64,12 +64,12 @@ template class UniqueFunctionBase { protected: static constexpr size_t InlineStorageSize = sizeof(void *) * 3; - // MSVC has a bug and ICEs if we give it a particular dependent value - // expression as part of the `std::conditional` below. To work around this, - // we build that into a template struct's constexpr bool. 
- template struct IsSizeLessThanThresholdT { - static constexpr bool value = sizeof(T) <= (2 * sizeof(void *)); - }; + template + struct IsSizeLessThanThresholdT : std::false_type {}; + + template + struct IsSizeLessThanThresholdT< + T, std::enable_if_t> : std::true_type {}; // Provide a type function to map parameters that won't observe extra copies // or moves and which are small enough to likely pass in register to values diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Hashing.h b/contrib/llvm-project/llvm/include/llvm/ADT/Hashing.h index 9ee310c879fd..cb53b7fa7469 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Hashing.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Hashing.h @@ -52,6 +52,7 @@ #include #include #include +#include #include namespace llvm { @@ -112,6 +113,10 @@ template hash_code hash_value(const T *ptr); template hash_code hash_value(const std::pair &arg); +/// Compute a hash_code for a tuple. +template +hash_code hash_value(const std::tuple &arg); + /// Compute a hash_code for a standard string. template hash_code hash_value(const std::basic_string &arg); @@ -645,6 +650,26 @@ hash_code hash_value(const std::pair &arg) { return hash_combine(arg.first, arg.second); } +// Implementation details for the hash_value overload for std::tuple<...>(...). +namespace hashing { +namespace detail { + +template +hash_code hash_value_tuple_helper(const std::tuple &arg, + std::index_sequence indices) { + return hash_combine(std::get(arg)...); +} + +} // namespace detail +} // namespace hashing + +template +hash_code hash_value(const std::tuple &arg) { + // TODO: Use std::apply when LLVM starts using C++17. + return ::llvm::hashing::detail::hash_value_tuple_helper( + arg, typename std::index_sequence_for()); +} + // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. 
template diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/IntervalMap.h b/contrib/llvm-project/llvm/include/llvm/ADT/IntervalMap.h index db7804d0a551..0b6c7d667807 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/IntervalMap.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/IntervalMap.h @@ -963,8 +963,7 @@ public: private: // The root data is either a RootLeaf or a RootBranchData instance. - alignas(RootLeaf) alignas(RootBranchData) - AlignedCharArrayUnion data; + AlignedCharArrayUnion data; // Tree height. // 0: Leaves in root. @@ -979,10 +978,7 @@ private: Allocator &allocator; /// Represent data as a node type without breaking aliasing rules. - template - T &dataAs() const { - return *bit_cast(const_cast(data.buffer)); - } + template T &dataAs() const { return *bit_cast(&data); } const RootLeaf &rootLeaf() const { assert(!branched() && "Cannot acces leaf data in branched root"); @@ -1040,7 +1036,7 @@ private: public: explicit IntervalMap(Allocator &a) : height(0), rootSize(0), allocator(a) { - assert((uintptr_t(data.buffer) & (alignof(RootLeaf) - 1)) == 0 && + assert((uintptr_t(&data) & (alignof(RootLeaf) - 1)) == 0 && "Insufficient alignment"); new(&rootLeaf()) RootLeaf(); } diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h b/contrib/llvm-project/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h index 6d97fe15db8b..ca4c40db48b9 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -58,6 +58,7 @@ #include #include #include +#include namespace llvm { @@ -70,10 +71,23 @@ namespace llvm { template class RefCountedBase { mutable unsigned RefCount = 0; -public: +protected: RefCountedBase() = default; RefCountedBase(const RefCountedBase &) {} + RefCountedBase &operator=(const RefCountedBase &) = delete; + +#ifndef NDEBUG + ~RefCountedBase() { + assert(RefCount == 0 && + "Destruction occured when there are still references to this."); + } 
+#else + // Default the destructor in release builds, A trivial destructor may enable + // better codegen. + ~RefCountedBase() = default; +#endif +public: void Retain() const { ++RefCount; } void Release() const { @@ -85,10 +99,24 @@ public: /// A thread-safe version of \c RefCountedBase. template class ThreadSafeRefCountedBase { - mutable std::atomic RefCount; + mutable std::atomic RefCount{0}; protected: - ThreadSafeRefCountedBase() : RefCount(0) {} + ThreadSafeRefCountedBase() = default; + ThreadSafeRefCountedBase(const ThreadSafeRefCountedBase &) {} + ThreadSafeRefCountedBase & + operator=(const ThreadSafeRefCountedBase &) = delete; + +#ifndef NDEBUG + ~ThreadSafeRefCountedBase() { + assert(RefCount == 0 && + "Destruction occured when there are still references to this."); + } +#else + // Default the destructor in release builds, A trivial destructor may enable + // better codegen. + ~ThreadSafeRefCountedBase() = default; +#endif public: void Retain() const { RefCount.fetch_add(1, std::memory_order_relaxed); } @@ -148,6 +176,11 @@ public: S.Obj = nullptr; } + template + IntrusiveRefCntPtr(std::unique_ptr S) : Obj(S.release()) { + retain(); + } + template IntrusiveRefCntPtr(const IntrusiveRefCntPtr &S) : Obj(S.get()) { retain(); @@ -264,6 +297,12 @@ template struct simplify_type> { } }; +/// Factory function for creating intrusive ref counted pointers. 
+template +IntrusiveRefCntPtr makeIntrusiveRefCnt(Args &&...A) { + return IntrusiveRefCntPtr(new T(std::forward(A)...)); +} + } // end namespace llvm #endif // LLVM_ADT_INTRUSIVEREFCNTPTR_H diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Optional.h b/contrib/llvm-project/llvm/include/llvm/ADT/Optional.h index c64b82352397..a285c81d1be8 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Optional.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Optional.h @@ -15,6 +15,7 @@ #ifndef LLVM_ADT_OPTIONAL_H #define LLVM_ADT_OPTIONAL_H +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/None.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/type_traits.h" @@ -32,7 +33,30 @@ namespace optional_detail { struct in_place_t {}; /// Storage for any type. -template ::value> +// +// The specialization condition intentionally uses +// llvm::is_trivially_copy_constructible instead of +// std::is_trivially_copy_constructible. GCC versions prior to 7.4 may +// instantiate the copy constructor of `T` when +// std::is_trivially_copy_constructible is instantiated. This causes +// compilation to fail if we query the trivially copy constructible property of +// a class which is not copy constructible. +// +// The current implementation of OptionalStorage insists that in order to use +// the trivial specialization, the value_type must be trivially copy +// constructible and trivially copy assignable due to =default implementations +// of the copy/move constructor/assignment. It does not follow that this is +// necessarily the case std::is_trivially_copyable is true (hence the expanded +// specialization condition). +// +// The move constructible / assignable conditions emulate the remaining behavior +// of std::is_trivially_copyable. 
+template ::value && + std::is_trivially_copy_assignable::value && + (std::is_trivially_move_constructible::value || + !std::is_move_constructible::value) && + (std::is_trivially_move_assignable::value || + !std::is_move_assignable::value))> class OptionalStorage { union { char empty; @@ -43,21 +67,21 @@ class OptionalStorage { public: ~OptionalStorage() { reset(); } - OptionalStorage() noexcept : empty(), hasVal(false) {} + constexpr OptionalStorage() noexcept : empty(), hasVal(false) {} - OptionalStorage(OptionalStorage const &other) : OptionalStorage() { + constexpr OptionalStorage(OptionalStorage const &other) : OptionalStorage() { if (other.hasValue()) { emplace(other.value); } } - OptionalStorage(OptionalStorage &&other) : OptionalStorage() { + constexpr OptionalStorage(OptionalStorage &&other) : OptionalStorage() { if (other.hasValue()) { emplace(std::move(other.value)); } } template - explicit OptionalStorage(in_place_t, Args &&... args) + constexpr explicit OptionalStorage(in_place_t, Args &&... 
args) : value(std::forward(args)...), hasVal(true) {} void reset() noexcept { @@ -67,13 +91,13 @@ public: } } - bool hasValue() const noexcept { return hasVal; } + constexpr bool hasValue() const noexcept { return hasVal; } T &getValue() LLVM_LVALUE_FUNCTION noexcept { assert(hasVal); return value; } - T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { + constexpr T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { assert(hasVal); return value; } @@ -148,16 +172,16 @@ template class OptionalStorage { public: ~OptionalStorage() = default; - OptionalStorage() noexcept : empty{} {} + constexpr OptionalStorage() noexcept : empty{} {} - OptionalStorage(OptionalStorage const &other) = default; - OptionalStorage(OptionalStorage &&other) = default; + constexpr OptionalStorage(OptionalStorage const &other) = default; + constexpr OptionalStorage(OptionalStorage &&other) = default; OptionalStorage &operator=(OptionalStorage const &other) = default; OptionalStorage &operator=(OptionalStorage &&other) = default; template - explicit OptionalStorage(in_place_t, Args &&... args) + constexpr explicit OptionalStorage(in_place_t, Args &&... 
args) : value(std::forward(args)...), hasVal(true) {} void reset() noexcept { @@ -167,13 +191,13 @@ public: } } - bool hasValue() const noexcept { return hasVal; } + constexpr bool hasValue() const noexcept { return hasVal; } T &getValue() LLVM_LVALUE_FUNCTION noexcept { assert(hasVal); return value; } - T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { + constexpr T const &getValue() const LLVM_LVALUE_FUNCTION noexcept { assert(hasVal); return value; } @@ -221,11 +245,12 @@ public: constexpr Optional() {} constexpr Optional(NoneType) {} - Optional(const T &y) : Storage(optional_detail::in_place_t{}, y) {} - Optional(const Optional &O) = default; + constexpr Optional(const T &y) : Storage(optional_detail::in_place_t{}, y) {} + constexpr Optional(const Optional &O) = default; - Optional(T &&y) : Storage(optional_detail::in_place_t{}, std::move(y)) {} - Optional(Optional &&O) = default; + constexpr Optional(T &&y) + : Storage(optional_detail::in_place_t{}, std::move(y)) {} + constexpr Optional(Optional &&O) = default; Optional &operator=(T &&y) { Storage = std::move(y); @@ -238,7 +263,7 @@ public: Storage.emplace(std::forward(Args)...); } - static inline Optional create(const T *y) { + static constexpr Optional create(const T *y) { return y ? 
Optional(*y) : Optional(); } @@ -250,16 +275,20 @@ public: void reset() { Storage.reset(); } - const T *getPointer() const { return &Storage.getValue(); } + constexpr const T *getPointer() const { return &Storage.getValue(); } T *getPointer() { return &Storage.getValue(); } - const T &getValue() const LLVM_LVALUE_FUNCTION { return Storage.getValue(); } + constexpr const T &getValue() const LLVM_LVALUE_FUNCTION { + return Storage.getValue(); + } T &getValue() LLVM_LVALUE_FUNCTION { return Storage.getValue(); } - explicit operator bool() const { return hasValue(); } - bool hasValue() const { return Storage.hasValue(); } - const T *operator->() const { return getPointer(); } + constexpr explicit operator bool() const { return hasValue(); } + constexpr bool hasValue() const { return Storage.hasValue(); } + constexpr const T *operator->() const { return getPointer(); } T *operator->() { return getPointer(); } - const T &operator*() const LLVM_LVALUE_FUNCTION { return getValue(); } + constexpr const T &operator*() const LLVM_LVALUE_FUNCTION { + return getValue(); + } T &operator*() LLVM_LVALUE_FUNCTION { return getValue(); } template @@ -294,137 +323,157 @@ public: #endif }; +template llvm::hash_code hash_value(const Optional &O) { + return O ? 
hash_combine(true, *O) : hash_value(false); +} + template -bool operator==(const Optional &X, const Optional &Y) { +constexpr bool operator==(const Optional &X, const Optional &Y) { if (X && Y) return *X == *Y; return X.hasValue() == Y.hasValue(); } template -bool operator!=(const Optional &X, const Optional &Y) { +constexpr bool operator!=(const Optional &X, const Optional &Y) { return !(X == Y); } template -bool operator<(const Optional &X, const Optional &Y) { +constexpr bool operator<(const Optional &X, const Optional &Y) { if (X && Y) return *X < *Y; return X.hasValue() < Y.hasValue(); } template -bool operator<=(const Optional &X, const Optional &Y) { +constexpr bool operator<=(const Optional &X, const Optional &Y) { return !(Y < X); } template -bool operator>(const Optional &X, const Optional &Y) { +constexpr bool operator>(const Optional &X, const Optional &Y) { return Y < X; } template -bool operator>=(const Optional &X, const Optional &Y) { +constexpr bool operator>=(const Optional &X, const Optional &Y) { return !(X < Y); } -template -bool operator==(const Optional &X, NoneType) { +template +constexpr bool operator==(const Optional &X, NoneType) { return !X; } -template -bool operator==(NoneType, const Optional &X) { +template +constexpr bool operator==(NoneType, const Optional &X) { return X == None; } -template -bool operator!=(const Optional &X, NoneType) { +template +constexpr bool operator!=(const Optional &X, NoneType) { return !(X == None); } -template -bool operator!=(NoneType, const Optional &X) { +template +constexpr bool operator!=(NoneType, const Optional &X) { return X != None; } -template bool operator<(const Optional &X, NoneType) { +template constexpr bool operator<(const Optional &X, NoneType) { return false; } -template bool operator<(NoneType, const Optional &X) { +template constexpr bool operator<(NoneType, const Optional &X) { return X.hasValue(); } -template bool operator<=(const Optional &X, NoneType) { +template +constexpr bool 
operator<=(const Optional &X, NoneType) { return !(None < X); } -template bool operator<=(NoneType, const Optional &X) { +template +constexpr bool operator<=(NoneType, const Optional &X) { return !(X < None); } -template bool operator>(const Optional &X, NoneType) { +template constexpr bool operator>(const Optional &X, NoneType) { return None < X; } -template bool operator>(NoneType, const Optional &X) { +template constexpr bool operator>(NoneType, const Optional &X) { return X < None; } -template bool operator>=(const Optional &X, NoneType) { +template +constexpr bool operator>=(const Optional &X, NoneType) { return None <= X; } -template bool operator>=(NoneType, const Optional &X) { +template +constexpr bool operator>=(NoneType, const Optional &X) { return X <= None; } -template bool operator==(const Optional &X, const T &Y) { +template +constexpr bool operator==(const Optional &X, const T &Y) { return X && *X == Y; } -template bool operator==(const T &X, const Optional &Y) { +template +constexpr bool operator==(const T &X, const Optional &Y) { return Y && X == *Y; } -template bool operator!=(const Optional &X, const T &Y) { +template +constexpr bool operator!=(const Optional &X, const T &Y) { return !(X == Y); } -template bool operator!=(const T &X, const Optional &Y) { +template +constexpr bool operator!=(const T &X, const Optional &Y) { return !(X == Y); } -template bool operator<(const Optional &X, const T &Y) { +template +constexpr bool operator<(const Optional &X, const T &Y) { return !X || *X < Y; } -template bool operator<(const T &X, const Optional &Y) { +template +constexpr bool operator<(const T &X, const Optional &Y) { return Y && X < *Y; } -template bool operator<=(const Optional &X, const T &Y) { +template +constexpr bool operator<=(const Optional &X, const T &Y) { return !(Y < X); } -template bool operator<=(const T &X, const Optional &Y) { +template +constexpr bool operator<=(const T &X, const Optional &Y) { return !(Y < X); } -template bool 
operator>(const Optional &X, const T &Y) { +template +constexpr bool operator>(const Optional &X, const T &Y) { return Y < X; } -template bool operator>(const T &X, const Optional &Y) { +template +constexpr bool operator>(const T &X, const Optional &Y) { return Y < X; } -template bool operator>=(const Optional &X, const T &Y) { +template +constexpr bool operator>=(const Optional &X, const T &Y) { return !(X < Y); } -template bool operator>=(const T &X, const Optional &Y) { +template +constexpr bool operator>=(const T &X, const Optional &Y) { return !(X < Y); } diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/PointerUnion.h b/contrib/llvm-project/llvm/include/llvm/ADT/PointerUnion.h index 6fecff8d756f..c39691061b72 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/PointerUnion.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/PointerUnion.h @@ -93,13 +93,6 @@ namespace pointer_union_detail { static constexpr int NumLowBitsAvailable = lowBitsAvailable(); }; - /// Implement assignment in terms of construction. - template struct AssignableFrom { - Derived &operator=(T t) { - return static_cast(*this) = Derived(t); - } - }; - template class PointerUnionMembers; diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/STLExtras.h b/contrib/llvm-project/llvm/include/llvm/ADT/STLExtras.h index 50b688b36648..63c7f48a5bd2 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/STLExtras.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/STLExtras.h @@ -193,9 +193,15 @@ public: template function_ref( Callable &&callable, + // This is not the copy-constructor. std::enable_if_t< !std::is_same>, - function_ref>::value> * = nullptr) + function_ref>::value> * = nullptr, + // Functor must be callable and return a suitable type. 
+ std::enable_if_t::value || + std::is_convertible()( + std::declval()...)), + Ret>::value> * = nullptr) : callback(callback_fn::type>), callable(reinterpret_cast(&callable)) {} @@ -206,15 +212,6 @@ public: explicit operator bool() const { return callback; } }; -// deleter - Very very very simple method that is used to invoke operator -// delete on something. It is used like this: -// -// for_each(V.begin(), B.end(), deleter); -template -inline void deleter(T *Ptr) { - delete Ptr; -} - //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// @@ -275,7 +272,7 @@ template bool hasSingleElement(ContainerTy &&C) { /// Return a range covering \p RangeOrContainer with the first N elements /// excluded. -template auto drop_begin(T &&RangeOrContainer, size_t N) { +template auto drop_begin(T &&RangeOrContainer, size_t N = 1) { return make_range(std::next(adl_begin(RangeOrContainer), N), adl_end(RangeOrContainer)); } @@ -541,7 +538,7 @@ public: early_inc_iterator_impl(WrappedIteratorT I) : BaseT(I) {} using BaseT::operator*; - typename BaseT::reference operator*() { + decltype(*std::declval()) operator*() { #if LLVM_ENABLE_ABI_BREAKING_CHECKS assert(!IsEarlyIncremented && "Cannot dereference twice!"); IsEarlyIncremented = true; @@ -558,12 +555,12 @@ public: return *this; } - using BaseT::operator==; - bool operator==(const early_inc_iterator_impl &RHS) const { + friend bool operator==(const early_inc_iterator_impl &LHS, + const early_inc_iterator_impl &RHS) { #if LLVM_ENABLE_ABI_BREAKING_CHECKS - assert(!IsEarlyIncremented && "Cannot compare after dereferencing!"); + assert(!LHS.IsEarlyIncremented && "Cannot compare after dereferencing!"); #endif - return BaseT::operator==(RHS); + return (const BaseT &)LHS == (const BaseT &)RHS; } }; @@ -1246,6 +1243,15 @@ public: } }; +/// Given a container of pairs, return a range over the first elements. 
+template auto make_first_range(ContainerTy &&c) { + return llvm::map_range( + std::forward(c), + [](decltype((*std::begin(c))) elt) -> decltype((elt.first)) { + return elt.first; + }); +} + /// Given a container of pairs, return a range over the second elements. template auto make_second_range(ContainerTy &&c) { return llvm::map_range( @@ -1422,7 +1428,7 @@ template // is trivially copyable. using sort_trivially_copyable = conjunction< std::is_pointer, - is_trivially_copyable::value_type>>; + std::is_trivially_copyable::value_type>>; } // namespace detail // Provide wrappers to std::sort which shuffle the elements before sorting @@ -1471,18 +1477,19 @@ inline void sort(Container &&C, Compare Comp) { /// which is only enabled when the operation is O(1). template auto size(R &&Range, - std::enable_if_t::iterator_category, - std::random_access_iterator_tag>::value, - void> * = nullptr) { + std::enable_if_t< + std::is_base_of::iterator_category>::value, + void> * = nullptr) { return std::distance(Range.begin(), Range.end()); } /// Provide wrappers to std::for_each which take ranges instead of having to /// pass begin/end explicitly. -template -UnaryPredicate for_each(R &&Range, UnaryPredicate P) { - return std::for_each(adl_begin(Range), adl_end(Range), P); +template +UnaryFunction for_each(R &&Range, UnaryFunction F) { + return std::for_each(adl_begin(Range), adl_end(Range), F); } /// Provide wrappers to std::all_of which take ranges instead of having to pass @@ -1543,6 +1550,13 @@ OutputIt copy(R &&Range, OutputIt Out) { return std::copy(adl_begin(Range), adl_end(Range), Out); } +/// Provide wrappers to std::move which take ranges instead of having to +/// pass begin/end explicitly. +template +OutputIt move(R &&Range, OutputIt Out) { + return std::move(adl_begin(Range), adl_end(Range), Out); +} + /// Wrapper function around std::find to detect if an element exists /// in a container. 
template @@ -1577,9 +1591,9 @@ auto count_if(R &&Range, UnaryPredicate P) { /// Wrapper function around std::transform to apply a function to a range and /// store the result elsewhere. -template -OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P) { - return std::transform(adl_begin(Range), adl_end(Range), d_first, P); +template +OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F) { + return std::transform(adl_begin(Range), adl_end(Range), d_first, F); } /// Provide wrappers to std::partition which take ranges instead of having to @@ -1654,6 +1668,22 @@ void erase_if(Container &C, UnaryPredicate P) { C.erase(remove_if(C, P), C.end()); } +/// Wrapper function to remove a value from a container: +/// +/// C.erase(remove(C.begin(), C.end(), V), C.end()); +template +void erase_value(Container &C, ValueType V) { + C.erase(std::remove(C.begin(), C.end(), V), C.end()); +} + +/// Wrapper function to append a range to a container. +/// +/// C.insert(C.end(), R.begin(), R.end()); +template +inline void append_range(Container &C, Range &&R) { + C.insert(C.end(), R.begin(), R.end()); +} + /// Given a sequence container Cont, replace the range [ContIt, ContEnd) with /// the range [ValIt, ValEnd) (which is not from the same container). template @@ -1911,16 +1941,16 @@ decltype(auto) apply_tuple(F &&f, Tuple &&t) { /// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N) /// time. Not meant for use with random-access iterators. /// Can optionally take a predicate to filter lazily some items. 
-template()) &)> +template ()) &)> bool hasNItems( IterTy &&Begin, IterTy &&End, unsigned N, Pred &&ShouldBeCounted = [](const decltype(*std::declval()) &) { return true; }, std::enable_if_t< - !std::is_same>::iterator_category, - std::random_access_iterator_tag>::value, + !std::is_base_of>::iterator_category>::value, void> * = nullptr) { for (; N; ++Begin) { if (Begin == End) @@ -1936,16 +1966,16 @@ bool hasNItems( /// Return true if the sequence [Begin, End) has N or more items. Runs in O(N) /// time. Not meant for use with random-access iterators. /// Can optionally take a predicate to lazily filter some items. -template()) &)> +template ()) &)> bool hasNItemsOrMore( IterTy &&Begin, IterTy &&End, unsigned N, Pred &&ShouldBeCounted = [](const decltype(*std::declval()) &) { return true; }, std::enable_if_t< - !std::is_same>::iterator_category, - std::random_access_iterator_tag>::value, + !std::is_base_of>::iterator_category>::value, void> * = nullptr) { for (; N; ++Begin) { if (Begin == End) diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Sequence.h b/contrib/llvm-project/llvm/include/llvm/ADT/Sequence.h index 8c505f2010dd..8a695d75f77a 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Sequence.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Sequence.h @@ -42,6 +42,10 @@ public: value_sequence_iterator(const value_sequence_iterator &) = default; value_sequence_iterator(value_sequence_iterator &&Arg) : Value(std::move(Arg.Value)) {} + value_sequence_iterator &operator=(const value_sequence_iterator &Arg) { + Value = Arg.Value; + return *this; + } template ()))> value_sequence_iterator(U &&Value) : Value(std::forward(Value)) {} diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/SetVector.h b/contrib/llvm-project/llvm/include/llvm/ADT/SetVector.h index 91ad72143ed3..32bcd50966cc 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/SetVector.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/SetVector.h @@ -205,6 +205,11 @@ public: return 
true; } + /// Check if the SetVector contains the given key. + bool contains(const key_type &key) const { + return set_.find(key) != set_.end(); + } + /// Count the number of elements of a given key in the SetVector. /// \returns 0 if the element is not in the SetVector, 1 if it is. size_type count(const key_type &key) const { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/SmallSet.h b/contrib/llvm-project/llvm/include/llvm/ADT/SmallSet.h index a03fa7dd8423..0600e528ee69 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/SmallSet.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/SmallSet.h @@ -232,6 +232,13 @@ public: return {Set.end()}; } + /// Check if the SmallSet contains the given element. + bool contains(const T &V) const { + if (isSmall()) + return vfind(V) != Vector.end(); + return Set.find(V) != Set.end(); + } + private: bool isSmall() const { return Set.empty(); } diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/SmallString.h b/contrib/llvm-project/llvm/include/llvm/ADT/SmallString.h index cd6f2173d04f..5a56321ae492 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/SmallString.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/SmallString.h @@ -30,63 +30,56 @@ public: /// Initialize from a StringRef. SmallString(StringRef S) : SmallVector(S.begin(), S.end()) {} + /// Initialize by concatenating a list of StringRefs. + SmallString(std::initializer_list Refs) + : SmallVector() { + this->append(Refs); + } + /// Initialize with a range. template SmallString(ItTy S, ItTy E) : SmallVector(S, E) {} - // Note that in order to add new overloads for append & assign, we have to - // duplicate the inherited versions so as not to inadvertently hide them. - /// @} /// @name String Assignment /// @{ - /// Assign from a repeated element. - void assign(size_t NumElts, char Elt) { - this->SmallVectorImpl::assign(NumElts, Elt); - } - - /// Assign from an iterator pair. 
- template - void assign(in_iter S, in_iter E) { - this->clear(); - SmallVectorImpl::append(S, E); - } + using SmallVector::assign; /// Assign from a StringRef. void assign(StringRef RHS) { - this->clear(); - SmallVectorImpl::append(RHS.begin(), RHS.end()); + SmallVectorImpl::assign(RHS.begin(), RHS.end()); } - /// Assign from a SmallVector. - void assign(const SmallVectorImpl &RHS) { + /// Assign from a list of StringRefs. + void assign(std::initializer_list Refs) { this->clear(); - SmallVectorImpl::append(RHS.begin(), RHS.end()); + append(Refs); } /// @} /// @name String Concatenation /// @{ - /// Append from an iterator pair. - template - void append(in_iter S, in_iter E) { - SmallVectorImpl::append(S, E); - } - - void append(size_t NumInputs, char Elt) { - SmallVectorImpl::append(NumInputs, Elt); - } + using SmallVector::append; /// Append from a StringRef. void append(StringRef RHS) { SmallVectorImpl::append(RHS.begin(), RHS.end()); } - /// Append from a SmallVector. - void append(const SmallVectorImpl &RHS) { - SmallVectorImpl::append(RHS.begin(), RHS.end()); + /// Append from a list of StringRefs. + void append(std::initializer_list Refs) { + size_t SizeNeeded = this->size(); + for (const StringRef &Ref : Refs) + SizeNeeded += Ref.size(); + this->reserve(SizeNeeded); + auto CurEnd = this->end(); + for (const StringRef &Ref : Refs) { + this->uninitialized_copy(Ref.begin(), Ref.end(), CurEnd); + CurEnd += Ref.size(); + } + this->set_size(SizeNeeded); } /// @} @@ -280,9 +273,9 @@ public: } // Extra operators. 
- const SmallString &operator=(StringRef RHS) { - this->clear(); - return *this += RHS; + SmallString &operator=(StringRef RHS) { + this->assign(RHS); + return *this; } SmallString &operator+=(StringRef RHS) { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/SmallVector.h b/contrib/llvm-project/llvm/include/llvm/ADT/SmallVector.h index 3ccee3d21d48..e960b272db04 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/SmallVector.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/SmallVector.h @@ -14,7 +14,6 @@ #define LLVM_ADT_SMALLVECTOR_H #include "llvm/ADT/iterator_range.h" -#include "llvm/Support/AlignOf.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -57,10 +56,15 @@ protected: SmallVectorBase(void *FirstEl, size_t TotalCapacity) : BeginX(FirstEl), Capacity(TotalCapacity) {} + /// This is a helper for \a grow() that's out of line to reduce code + /// duplication. This function will report a fatal error if it can't grow at + /// least to \p MinSize. + void *mallocForGrow(size_t MinSize, size_t TSize, size_t &NewCapacity); + /// This is an implementation of the grow() method which only works /// on POD-like data types and is out of line to reduce code duplication. /// This function will report a fatal error if it cannot increase capacity. - void grow_pod(void *FirstEl, size_t MinCapacity, size_t TSize); + void grow_pod(void *FirstEl, size_t MinSize, size_t TSize); public: size_t size() const { return Size; } @@ -90,8 +94,9 @@ using SmallVectorSizeType = /// Figure out the offset of the first element. 
template struct SmallVectorAlignmentAndSize { - AlignedCharArrayUnion>> Base; - AlignedCharArrayUnion FirstEl; + alignas(SmallVectorBase>) char Base[sizeof( + SmallVectorBase>)]; + alignas(T) char FirstEl[sizeof(T)]; }; /// This is the part of SmallVectorTemplateBase which does not depend on whether @@ -115,8 +120,8 @@ class SmallVectorTemplateCommon protected: SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} - void grow_pod(size_t MinCapacity, size_t TSize) { - Base::grow_pod(getFirstEl(), MinCapacity, TSize); + void grow_pod(size_t MinSize, size_t TSize) { + Base::grow_pod(getFirstEl(), MinSize, TSize); } /// Return true if this is a smallvector which has not had dynamic @@ -129,6 +134,102 @@ protected: this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect. } + /// Return true if V is an internal reference to the given range. + bool isReferenceToRange(const void *V, const void *First, const void *Last) const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(V, First) && LessThan(V, Last); + } + + /// Return true if V is an internal reference to this vector. + bool isReferenceToStorage(const void *V) const { + return isReferenceToRange(V, this->begin(), this->end()); + } + + /// Return true if First and Last form a valid (possibly empty) range in this + /// vector's storage. + bool isRangeInStorage(const void *First, const void *Last) const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(First, this->begin()) && !LessThan(Last, First) && + !LessThan(this->end(), Last); + } + + /// Return true unless Elt will be invalidated by resizing the vector to + /// NewSize. + bool isSafeToReferenceAfterResize(const void *Elt, size_t NewSize) { + // Past the end. + if (LLVM_LIKELY(!isReferenceToStorage(Elt))) + return true; + + // Return false if Elt will be destroyed by shrinking. 
+ if (NewSize <= this->size()) + return Elt < this->begin() + NewSize; + + // Return false if we need to grow. + return NewSize <= this->capacity(); + } + + /// Check whether Elt will be invalidated by resizing the vector to NewSize. + void assertSafeToReferenceAfterResize(const void *Elt, size_t NewSize) { + assert(isSafeToReferenceAfterResize(Elt, NewSize) && + "Attempting to reference an element of the vector in an operation " + "that invalidates it"); + } + + /// Check whether Elt will be invalidated by increasing the size of the + /// vector by N. + void assertSafeToAdd(const void *Elt, size_t N = 1) { + this->assertSafeToReferenceAfterResize(Elt, this->size() + N); + } + + /// Check whether any part of the range will be invalidated by clearing. + void assertSafeToReferenceAfterClear(const T *From, const T *To) { + if (From == To) + return; + this->assertSafeToReferenceAfterResize(From, 0); + this->assertSafeToReferenceAfterResize(To - 1, 0); + } + template < + class ItTy, + std::enable_if_t, T *>::value, + bool> = false> + void assertSafeToReferenceAfterClear(ItTy, ItTy) {} + + /// Check whether any part of the range will be invalidated by growing. + void assertSafeToAddRange(const T *From, const T *To) { + if (From == To) + return; + this->assertSafeToAdd(From, To - From); + this->assertSafeToAdd(To - 1, To - From); + } + template < + class ItTy, + std::enable_if_t, T *>::value, + bool> = false> + void assertSafeToAddRange(ItTy, ItTy) {} + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. 
+ template + static const T *reserveForParamAndGetAddressImpl(U *This, const T &Elt, + size_t N) { + size_t NewSize = This->size() + N; + if (LLVM_LIKELY(NewSize <= This->capacity())) + return &Elt; + + bool ReferencesStorage = false; + int64_t Index = -1; + if (!U::TakesParamByValue) { + if (LLVM_UNLIKELY(This->isReferenceToStorage(&Elt))) { + ReferencesStorage = true; + Index = &Elt - This->begin(); + } + } + This->grow(NewSize); + return ReferencesStorage ? This->begin() + Index : &Elt; + } + public: using size_type = size_t; using difference_type = ptrdiff_t; @@ -212,7 +313,12 @@ template ::value) && (is_trivially_move_constructible::value) && std::is_trivially_destructible::value> class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + protected: + static constexpr bool TakesParamByValue = false; + using ValueParamT = const T &; + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} static void destroy_range(T *S, T *E) { @@ -242,18 +348,68 @@ protected: /// element, or MinSize more elements if specified. void grow(size_t MinSize = 0); + /// Create a new allocation big enough for \p MinSize and pass back its size + /// in \p NewCapacity. This is the first section of \a grow(). + T *mallocForGrow(size_t MinSize, size_t &NewCapacity) { + return static_cast( + SmallVectorBase>::mallocForGrow( + MinSize, sizeof(T), NewCapacity)); + } + + /// Move existing elements over to the new allocation \p NewElts, the middle + /// section of \a grow(). + void moveElementsForGrow(T *NewElts); + + /// Transfer ownership of the allocation, finishing up \a grow(). + void takeAllocationForGrow(T *NewElts, size_t NewCapacity); + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. 
+ const T *reserveForParamAndGetAddress(const T &Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T *reserveForParamAndGetAddress(T &Elt, size_t N = 1) { + return const_cast( + this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + static T &&forward_value_param(T &&V) { return std::move(V); } + static const T &forward_value_param(const T &V) { return V; } + + void growAndAssign(size_t NumElts, const T &Elt) { + // Grow manually in case Elt is an internal reference. + size_t NewCapacity; + T *NewElts = mallocForGrow(NumElts, NewCapacity); + std::uninitialized_fill_n(NewElts, NumElts, Elt); + this->destroy_range(this->begin(), this->end()); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(NumElts); + } + + template T &growAndEmplaceBack(ArgTypes &&... Args) { + // Grow manually in case one of Args is an internal reference. + size_t NewCapacity; + T *NewElts = mallocForGrow(0, NewCapacity); + ::new ((void *)(NewElts + this->size())) T(std::forward(Args)...); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(this->size() + 1); + return this->back(); + } + public: void push_back(const T &Elt) { - if (LLVM_UNLIKELY(this->size() >= this->capacity())) - this->grow(); - ::new ((void*) this->end()) T(Elt); + const T *EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void *)this->end()) T(*EltPtr); this->set_size(this->size() + 1); } void push_back(T &&Elt) { - if (LLVM_UNLIKELY(this->size() >= this->capacity())) - this->grow(); - ::new ((void*) this->end()) T(::std::move(Elt)); + T *EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void *)this->end()) T(::std::move(*EltPtr)); this->set_size(this->size() + 1); } @@ -266,29 +422,27 @@ public: // Define this out-of-line to dissuade the C++ compiler from inlining it. 
template void SmallVectorTemplateBase::grow(size_t MinSize) { - // Ensure we can fit the new capacity. - // This is only going to be applicable when the capacity is 32 bit. - if (MinSize > this->SizeTypeMax()) - report_bad_alloc_error("SmallVector capacity overflow during allocation"); - - // Ensure we can meet the guarantee of space for at least one more element. - // The above check alone will not catch the case where grow is called with a - // default MinCapacity of 0, but the current capacity cannot be increased. - // This is only going to be applicable when the capacity is 32 bit. - if (this->capacity() == this->SizeTypeMax()) - report_bad_alloc_error("SmallVector capacity unable to grow"); - - // Always grow, even from zero. - size_t NewCapacity = size_t(NextPowerOf2(this->capacity() + 2)); - NewCapacity = std::min(std::max(NewCapacity, MinSize), this->SizeTypeMax()); - T *NewElts = static_cast(llvm::safe_malloc(NewCapacity*sizeof(T))); + size_t NewCapacity; + T *NewElts = mallocForGrow(MinSize, NewCapacity); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); +} +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::moveElementsForGrow( + T *NewElts) { // Move the elements over. this->uninitialized_move(this->begin(), this->end(), NewElts); // Destroy the original elements. destroy_range(this->begin(), this->end()); +} +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::takeAllocationForGrow( + T *NewElts, size_t NewCapacity) { // If this wasn't grown from the inline copy, deallocate the old space. if (!this->isSmall()) free(this->begin()); @@ -303,7 +457,18 @@ void SmallVectorTemplateBase::grow(size_t MinSize) { /// skipping destruction. 
template class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + protected: + /// True if it's cheap enough to take parameters by value. Doing so avoids + /// overhead related to mitigations for reference invalidation. + static constexpr bool TakesParamByValue = sizeof(T) <= 2 * sizeof(void *); + + /// Either const T& or T, depending on whether it's cheap enough to take + /// parameters by value. + using ValueParamT = + typename std::conditional::type; + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} // No need to do a destroy loop for POD's. @@ -344,11 +509,43 @@ protected: /// least one more element or MinSize if specified. void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); } + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + const T *reserveForParamAndGetAddress(const T &Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T *reserveForParamAndGetAddress(T &Elt, size_t N = 1) { + return const_cast( + this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + /// Copy \p V or return a reference, depending on \a ValueParamT. + static ValueParamT forward_value_param(ValueParamT V) { return V; } + + void growAndAssign(size_t NumElts, T Elt) { + // Elt has been copied in case it's an internal reference, side-stepping + // reference invalidation problems without losing the realloc optimization. + this->set_size(0); + this->grow(NumElts); + std::uninitialized_fill_n(this->begin(), NumElts, Elt); + this->set_size(NumElts); + } + + template T &growAndEmplaceBack(ArgTypes &&... 
Args) { + // Use push_back with a copy in case Args has an internal reference, + // side-stepping reference invalidation problems without losing the realloc + // optimization. + push_back(T(std::forward(Args)...)); + return this->back(); + } + public: - void push_back(const T &Elt) { - if (LLVM_UNLIKELY(this->size() >= this->capacity())) - this->grow(); - memcpy(reinterpret_cast(this->end()), &Elt, sizeof(T)); + void push_back(ValueParamT Elt) { + const T *EltPtr = reserveForParamAndGetAddress(Elt); + memcpy(reinterpret_cast(this->end()), EltPtr, sizeof(T)); this->set_size(this->size() + 1); } @@ -368,6 +565,9 @@ public: using size_type = typename SuperClass::size_type; protected: + using SmallVectorTemplateBase::TakesParamByValue; + using ValueParamT = typename SuperClass::ValueParamT; + // Default ctor - Initialize to empty. explicit SmallVectorImpl(unsigned N) : SmallVectorTemplateBase(N) {} @@ -387,29 +587,38 @@ public: this->Size = 0; } - void resize(size_type N) { +private: + template void resizeImpl(size_type N) { if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->set_size(N); + this->pop_back_n(this->size() - N); } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); + this->reserve(N); for (auto I = this->end(), E = this->begin() + N; I != E; ++I) - new (&*I) T(); + if (ForOverwrite) + new (&*I) T; + else + new (&*I) T(); this->set_size(N); } } - void resize(size_type N, const T &NV) { +public: + void resize(size_type N) { resizeImpl(N); } + + /// Like resize, but \ref T is POD, the new values won't be initialized. 
+ void resize_for_overwrite(size_type N) { resizeImpl(N); } + + void resize(size_type N, ValueParamT NV) { + if (N == this->size()) + return; + if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->set_size(N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - std::uninitialized_fill(this->end(), this->begin()+N, NV); - this->set_size(N); + this->pop_back_n(this->size() - N); + return; } + + // N > this->size(). Defer to append. + this->append(N - this->size(), NV); } void reserve(size_type N) { @@ -417,6 +626,12 @@ public: this->grow(N); } + void pop_back_n(size_type NumItems) { + assert(this->size() >= NumItems); + this->destroy_range(this->end() - NumItems, this->end()); + this->set_size(this->size() - NumItems); + } + LLVM_NODISCARD T pop_back_val() { T Result = ::std::move(this->back()); this->pop_back(); @@ -431,20 +646,17 @@ public: typename std::iterator_traits::iterator_category, std::input_iterator_tag>::value>> void append(in_iter in_start, in_iter in_end) { + this->assertSafeToAddRange(in_start, in_end); size_type NumInputs = std::distance(in_start, in_end); - if (NumInputs > this->capacity() - this->size()) - this->grow(this->size()+NumInputs); - + this->reserve(this->size() + NumInputs); this->uninitialized_copy(in_start, in_end, this->end()); this->set_size(this->size() + NumInputs); } /// Append \p NumInputs copies of \p Elt to the end. 
- void append(size_type NumInputs, const T &Elt) { - if (NumInputs > this->capacity() - this->size()) - this->grow(this->size()+NumInputs); - - std::uninitialized_fill_n(this->end(), NumInputs, Elt); + void append(size_type NumInputs, ValueParamT Elt) { + const T *EltPtr = this->reserveForParamAndGetAddress(Elt, NumInputs); + std::uninitialized_fill_n(this->end(), NumInputs, *EltPtr); this->set_size(this->size() + NumInputs); } @@ -452,22 +664,33 @@ public: append(IL.begin(), IL.end()); } - // FIXME: Consider assigning over existing elements, rather than clearing & - // re-initializing them - for all assign(...) variants. + void append(const SmallVectorImpl &RHS) { append(RHS.begin(), RHS.end()); } - void assign(size_type NumElts, const T &Elt) { - clear(); - if (this->capacity() < NumElts) - this->grow(NumElts); + void assign(size_type NumElts, ValueParamT Elt) { + // Note that Elt could be an internal reference. + if (NumElts > this->capacity()) { + this->growAndAssign(NumElts, Elt); + return; + } + + // Assign over existing elements. + std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt); + if (NumElts > this->size()) + std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt); + else if (NumElts < this->size()) + this->destroy_range(this->begin() + NumElts, this->end()); this->set_size(NumElts); - std::uninitialized_fill(this->begin(), this->end(), Elt); } + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. + template ::iterator_category, std::input_iterator_tag>::value>> void assign(in_iter in_start, in_iter in_end) { + this->assertSafeToReferenceAfterClear(in_start, in_end); clear(); append(in_start, in_end); } @@ -477,12 +700,13 @@ public: append(IL); } + void assign(const SmallVectorImpl &RHS) { assign(RHS.begin(), RHS.end()); } + iterator erase(const_iterator CI) { // Just cast away constness because this is a non-const member function. 
iterator I = const_cast(CI); - assert(I >= this->begin() && "Iterator to erase is out of bounds."); - assert(I < this->end() && "Erasing at past-the-end iterator."); + assert(this->isReferenceToStorage(CI) && "Iterator to erase is out of bounds."); iterator N = I; // Shift all elts down one. @@ -497,9 +721,7 @@ public: iterator S = const_cast(CS); iterator E = const_cast(CE); - assert(S >= this->begin() && "Range to erase is out of bounds."); - assert(S <= E && "Trying to erase invalid range."); - assert(E <= this->end() && "Trying to erase past the end."); + assert(this->isRangeInStorage(S, E) && "Range to erase is out of bounds."); iterator N = S; // Shift all elts down. @@ -510,20 +732,26 @@ public: return(N); } - iterator insert(iterator I, T &&Elt) { +private: + template iterator insert_one_impl(iterator I, ArgType &&Elt) { + // Callers ensure that ArgType is derived from T. + static_assert( + std::is_same>, + T>::value, + "ArgType must be derived from T!"); + if (I == this->end()) { // Important special case for empty vector. - this->push_back(::std::move(Elt)); + this->push_back(::std::forward(Elt)); return this->end()-1; } - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); + assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds."); - if (this->size() >= this->capacity()) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } + // Grow if necessary. + size_t Index = I - this->begin(); + std::remove_reference_t *EltPtr = + this->reserveForParamAndGetAddress(Elt); + I = this->begin() + Index; ::new ((void*) this->end()) T(::std::move(this->back())); // Push everything else over. @@ -531,45 +759,26 @@ public: this->set_size(this->size() + 1); // If we just moved the element we're inserting, be sure to update - // the reference. 
- T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->end()) + // the reference (never happens if TakesParamByValue). + static_assert(!TakesParamByValue || std::is_same::value, + "ArgType must be 'T' when taking by value!"); + if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end())) ++EltPtr; - *I = ::std::move(*EltPtr); + *I = ::std::forward(*EltPtr); return I; } - iterator insert(iterator I, const T &Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(Elt); - return this->end()-1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->size() >= this->capacity()) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } - ::new ((void*) this->end()) T(std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end()-1, this->end()); - this->set_size(this->size() + 1); - - // If we just moved the element we're inserting, be sure to update - // the reference. 
- const T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->end()) - ++EltPtr; +public: + iterator insert(iterator I, T &&Elt) { + return insert_one_impl(I, this->forward_value_param(std::move(Elt))); + } - *I = *EltPtr; - return I; + iterator insert(iterator I, const T &Elt) { + return insert_one_impl(I, this->forward_value_param(Elt)); } - iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + iterator insert(iterator I, size_type NumToInsert, ValueParamT Elt) { // Convert iterator to elt# to avoid invalidating iterator when we reserve() size_t InsertElt = I - this->begin(); @@ -578,11 +787,11 @@ public: return this->begin()+InsertElt; } - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); + assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds."); - // Ensure there is enough space. - reserve(this->size() + NumToInsert); + // Ensure there is enough space, and get the (maybe updated) address of + // Elt. + const T *EltPtr = this->reserveForParamAndGetAddress(Elt, NumToInsert); // Uninvalidate the iterator. I = this->begin()+InsertElt; @@ -599,7 +808,12 @@ public: // Copy the existing elements that get replaced. std::move_backward(I, OldEnd-NumToInsert, OldEnd); - std::fill_n(I, NumToInsert, Elt); + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + + std::fill_n(I, NumToInsert, *EltPtr); return I; } @@ -612,11 +826,16 @@ public: size_t NumOverwritten = OldEnd-I; this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + // Replace the overwritten part. 
- std::fill_n(I, NumOverwritten, Elt); + std::fill_n(I, NumOverwritten, *EltPtr); // Insert the non-overwritten middle part. - std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, *EltPtr); return I; } @@ -633,8 +852,10 @@ public: return this->begin()+InsertElt; } - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); + assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds."); + + // Check that the reserve that follows doesn't invalidate the iterators. + this->assertSafeToAddRange(From, To); size_t NumToInsert = std::distance(From, To); @@ -686,7 +907,8 @@ public: template reference emplace_back(ArgTypes &&... Args) { if (LLVM_UNLIKELY(this->size() >= this->capacity())) - this->grow(); + return this->growAndEmplaceBack(std::forward(Args)...); + ::new ((void *)this->end()) T(std::forward(Args)...); this->set_size(this->size() + 1); return this->back(); @@ -721,10 +943,8 @@ void SmallVectorImpl::swap(SmallVectorImpl &RHS) { std::swap(this->Capacity, RHS.Capacity); return; } - if (RHS.size() > this->capacity()) - this->grow(RHS.size()); - if (this->size() > RHS.capacity()) - RHS.grow(this->size()); + this->reserve(RHS.size()); + RHS.reserve(this->size()); // Swap the shared elements. size_t NumShared = this->size(); @@ -779,8 +999,7 @@ SmallVectorImpl &SmallVectorImpl:: // FIXME: don't do this if they're efficiently moveable. if (this->capacity() < RHSSize) { // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->set_size(0); + this->clear(); CurSize = 0; this->grow(RHSSize); } else if (CurSize) { @@ -839,8 +1058,7 @@ SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { // elements. if (this->capacity() < RHSSize) { // Destroy current elements. 
- this->destroy_range(this->begin(), this->end()); - this->set_size(0); + this->clear(); CurSize = 0; this->grow(RHSSize); } else if (CurSize) { @@ -863,13 +1081,71 @@ SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { /// to avoid allocating unnecessary storage. template struct SmallVectorStorage { - AlignedCharArrayUnion InlineElts[N]; + alignas(T) char InlineElts[N * sizeof(T)]; }; /// We need the storage to be properly aligned even for small-size of 0 so that /// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is /// well-defined. -template struct alignas(alignof(T)) SmallVectorStorage {}; +template struct alignas(T) SmallVectorStorage {}; + +/// Forward declaration of SmallVector so that +/// calculateSmallVectorDefaultInlinedElements can reference +/// `sizeof(SmallVector)`. +template class LLVM_GSL_OWNER SmallVector; + +/// Helper class for calculating the default number of inline elements for +/// `SmallVector`. +/// +/// This should be migrated to a constexpr function when our minimum +/// compiler support is enough for multi-statement constexpr functions. +template struct CalculateSmallVectorDefaultInlinedElements { + // Parameter controlling the default number of inlined elements + // for `SmallVector`. + // + // The default number of inlined elements ensures that + // 1. There is at least one inlined element. + // 2. `sizeof(SmallVector) <= kPreferredSmallVectorSizeof` unless + // it contradicts 1. + static constexpr size_t kPreferredSmallVectorSizeof = 64; + + // static_assert that sizeof(T) is not "too big". + // + // Because our policy guarantees at least one inlined element, it is possible + // for an arbitrarily large inlined element to allocate an arbitrarily large + // amount of inline storage. 
We generally consider it an antipattern for a + // SmallVector to allocate an excessive amount of inline storage, so we want + // to call attention to these cases and make sure that users are making an + // intentional decision if they request a lot of inline storage. + // + // We want this assertion to trigger in pathological cases, but otherwise + // not be too easy to hit. To accomplish that, the cutoff is actually somewhat + // larger than kPreferredSmallVectorSizeof (otherwise, + // `SmallVector>` would be one easy way to trip it, and that + // pattern seems useful in practice). + // + // One wrinkle is that this assertion is in theory non-portable, since + // sizeof(T) is in general platform-dependent. However, we don't expect this + // to be much of an issue, because most LLVM development happens on 64-bit + // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for + // 32-bit hosts, dodging the issue. The reverse situation, where development + // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a + // 64-bit host, is expected to be very rare. + static_assert( + sizeof(T) <= 256, + "You are trying to use a default number of inlined elements for " + "`SmallVector` but `sizeof(T)` is really big! Please use an " + "explicit number of inlined elements with `SmallVector` to make " + "sure you really want that much inline storage."); + + // Discount the size of the header itself when calculating the maximum inline + // bytes. + static constexpr size_t PreferredInlineBytes = + kPreferredSmallVectorSizeof - sizeof(SmallVector); + static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T); + static constexpr size_t value = + NumElementsThatFit == 0 ? 1 : NumElementsThatFit; +}; /// This is a 'vector' (really, a variable-sized array), optimized /// for the case when the array is small. 
It contains some number of elements @@ -877,9 +1153,18 @@ template struct alignas(alignof(T)) SmallVectorStorage {}; /// elements is below that threshold. This allows normal "small" cases to be /// fast without losing generality for large inputs. /// -/// Note that this does not attempt to be exception safe. +/// \note +/// In the absence of a well-motivated choice for the number of inlined +/// elements \p N, it is recommended to use \c SmallVector (that is, +/// omitting the \p N). This will choose a default number of inlined elements +/// reasonable for allocation on the stack (for example, trying to keep \c +/// sizeof(SmallVector) around 64 bytes). /// -template +/// \warning This does not attempt to be exception safe. +/// +/// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h +template ::value> class LLVM_GSL_OWNER SmallVector : public SmallVectorImpl, SmallVectorStorage { public: @@ -918,7 +1203,7 @@ public: SmallVectorImpl::operator=(RHS); } - const SmallVector &operator=(const SmallVector &RHS) { + SmallVector &operator=(const SmallVector &RHS) { SmallVectorImpl::operator=(RHS); return *this; } @@ -933,17 +1218,17 @@ public: SmallVectorImpl::operator=(::std::move(RHS)); } - const SmallVector &operator=(SmallVector &&RHS) { + SmallVector &operator=(SmallVector &&RHS) { SmallVectorImpl::operator=(::std::move(RHS)); return *this; } - const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVector &operator=(SmallVectorImpl &&RHS) { SmallVectorImpl::operator=(::std::move(RHS)); return *this; } - const SmallVector &operator=(std::initializer_list IL) { + SmallVector &operator=(std::initializer_list IL) { this->assign(IL); return *this; } diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/SparseSet.h b/contrib/llvm-project/llvm/include/llvm/ADT/SparseSet.h index 74457d5fd679..d8acf1ee2f3a 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/SparseSet.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/SparseSet.h @@ -229,12 
+229,15 @@ public: return const_cast(this)->findIndex(KeyIndexOf(Key)); } + /// Check if the set contains the given \c Key. + /// + /// @param Key A valid key to find. + bool contains(const KeyT &Key) const { return find(Key) == end() ? 0 : 1; } + /// count - Returns 1 if this set contains an element identified by Key, /// 0 otherwise. /// - size_type count(const KeyT &Key) const { - return find(Key) == end() ? 0 : 1; - } + size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; } /// insert - Attempts to insert a new element. /// diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Statistic.h b/contrib/llvm-project/llvm/include/llvm/ADT/Statistic.h index d7aff6c5939a..aa338ccff19a 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Statistic.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Statistic.h @@ -36,6 +36,8 @@ // configure time. #if !defined(NDEBUG) || LLVM_FORCE_ENABLE_STATS #define LLVM_ENABLE_STATS 1 +#else +#define LLVM_ENABLE_STATS 0 #endif namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/StringExtras.h b/contrib/llvm-project/llvm/include/llvm/ADT/StringExtras.h index 990a3054a9d2..68e89508cba9 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/StringExtras.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/StringExtras.h @@ -66,17 +66,29 @@ inline ArrayRef arrayRefFromStringRef(StringRef Input) { /// /// If \p C is not a valid hex digit, -1U is returned. inline unsigned hexDigitValue(char C) { - if (C >= '0' && C <= '9') return C-'0'; - if (C >= 'a' && C <= 'f') return C-'a'+10U; - if (C >= 'A' && C <= 'F') return C-'A'+10U; - return -1U; + struct HexTable { + unsigned LUT[255] = {}; + constexpr HexTable() { + // Default initialize everything to invalid. + for (int i = 0; i < 255; ++i) + LUT[i] = ~0U; + // Initialize `0`-`9`. + for (int i = 0; i < 10; ++i) + LUT['0' + i] = i; + // Initialize `A`-`F` and `a`-`f`. 
+ for (int i = 0; i < 6; ++i) + LUT['A' + i] = LUT['a' + i] = 10 + i; + } + }; + constexpr HexTable Table; + return Table.LUT[static_cast(C)]; } /// Checks if character \p C is one of the 10 decimal digits. inline bool isDigit(char C) { return C >= '0' && C <= '9'; } /// Checks if character \p C is a hexadecimal numeric character. -inline bool isHexDigit(char C) { return hexDigitValue(C) != -1U; } +inline bool isHexDigit(char C) { return hexDigitValue(C) != ~0U; } /// Checks if character \p C is a valid letter as classified by "C" locale. inline bool isAlpha(char C) { @@ -165,34 +177,68 @@ inline std::string toHex(ArrayRef Input, bool LowerCase = false) { return toHex(toStringRef(Input), LowerCase); } -inline uint8_t hexFromNibbles(char MSB, char LSB) { +/// Store the binary representation of the two provided values, \p MSB and +/// \p LSB, that make up the nibbles of a hexadecimal digit. If \p MSB or \p LSB +/// do not correspond to proper nibbles of a hexadecimal digit, this method +/// returns false. Otherwise, returns true. +inline bool tryGetHexFromNibbles(char MSB, char LSB, uint8_t &Hex) { unsigned U1 = hexDigitValue(MSB); unsigned U2 = hexDigitValue(LSB); - assert(U1 != -1U && U2 != -1U); + if (U1 == ~0U || U2 == ~0U) + return false; - return static_cast((U1 << 4) | U2); + Hex = static_cast((U1 << 4) | U2); + return true; } -/// Convert hexadecimal string \p Input to its binary representation. -/// The return string is half the size of \p Input. -inline std::string fromHex(StringRef Input) { +/// Return the binary representation of the two provided values, \p MSB and +/// \p LSB, that make up the nibbles of a hexadecimal digit. 
+inline uint8_t hexFromNibbles(char MSB, char LSB) { + uint8_t Hex = 0; + bool GotHex = tryGetHexFromNibbles(MSB, LSB, Hex); + (void)GotHex; + assert(GotHex && "MSB and/or LSB do not correspond to hex digits"); + return Hex; +} + +/// Convert hexadecimal string \p Input to its binary representation and store +/// the result in \p Output. Returns true if the binary representation could be +/// converted from the hexadecimal string. Returns false if \p Input contains +/// non-hexadecimal digits. The output string is half the size of \p Input. +inline bool tryGetFromHex(StringRef Input, std::string &Output) { if (Input.empty()) - return std::string(); + return true; - std::string Output; Output.reserve((Input.size() + 1) / 2); if (Input.size() % 2 == 1) { - Output.push_back(hexFromNibbles('0', Input.front())); + uint8_t Hex = 0; + if (!tryGetHexFromNibbles('0', Input.front(), Hex)) + return false; + + Output.push_back(Hex); Input = Input.drop_front(); } assert(Input.size() % 2 == 0); while (!Input.empty()) { - uint8_t Hex = hexFromNibbles(Input[0], Input[1]); + uint8_t Hex = 0; + if (!tryGetHexFromNibbles(Input[0], Input[1], Hex)) + return false; + Output.push_back(Hex); Input = Input.drop_front(2); } - return Output; + return true; +} + +/// Convert hexadecimal string \p Input to its binary representation. +/// The return string is half the size of \p Input. 
+inline std::string fromHex(StringRef Input) { + std::string Hex; + bool GotHex = tryGetFromHex(Input, Hex); + (void)GotHex; + assert(GotHex && "Input contains non hex digits"); + return Hex; } /// Convert the string \p S to an integer of the specified type using @@ -245,7 +291,7 @@ inline std::string utostr(uint64_t X, bool isNeg = false) { inline std::string itostr(int64_t X) { if (X < 0) - return utostr(-static_cast(X), true); + return utostr(static_cast(1) + ~static_cast(X), true); else return utostr(static_cast(X)); } @@ -338,13 +384,16 @@ inline std::string join_impl(IteratorT Begin, IteratorT End, size_t Len = (std::distance(Begin, End) - 1) * Separator.size(); for (IteratorT I = Begin; I != End; ++I) - Len += (*Begin).size(); + Len += (*I).size(); S.reserve(Len); + size_t PrevCapacity = S.capacity(); + (void)PrevCapacity; S += (*Begin); while (++Begin != End) { S += Separator; S += (*Begin); } + assert(PrevCapacity == S.capacity() && "String grew during building"); return S; } @@ -416,6 +465,30 @@ inline std::string join_items(Sep Separator, Args &&... Items) { return Result; } +/// A helper class to return the specified delimiter string after the first +/// invocation of operator StringRef(). 
Used to generate a comma-separated +/// list from a loop like so: +/// +/// \code +/// ListSeparator LS; +/// for (auto &I : C) +/// OS << LS << I.getName(); +/// \end +class ListSeparator { + bool First = true; + StringRef Separator; + +public: + ListSeparator(StringRef Separator = ", ") : Separator(Separator) {} + operator StringRef() { + if (First) { + First = false; + return {}; + } + return Separator; + } +}; + } // end namespace llvm #endif // LLVM_ADT_STRINGEXTRAS_H diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/StringMap.h b/contrib/llvm-project/llvm/include/llvm/ADT/StringMap.h index 840f328db796..a82afc9a817c 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/StringMap.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/StringMap.h @@ -78,10 +78,12 @@ protected: void init(unsigned Size); public: + static constexpr uintptr_t TombstoneIntVal = + static_cast(-1) + << PointerLikeTypeTraits::NumLowBitsAvailable; + static StringMapEntryBase *getTombstoneVal() { - uintptr_t Val = static_cast(-1); - Val <<= PointerLikeTypeTraits::NumLowBitsAvailable; - return reinterpret_cast(Val); + return reinterpret_cast(TombstoneIntVal); } unsigned getNumBuckets() const { return NumBuckets; } @@ -387,7 +389,9 @@ public: return static_cast(*this); } - bool operator==(const DerivedTy &RHS) const { return Ptr == RHS.Ptr; } + friend bool operator==(const DerivedTy &LHS, const DerivedTy &RHS) { + return LHS.Ptr == RHS.Ptr; + } DerivedTy &operator++() { // Preincrement ++Ptr; diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/StringSet.h b/contrib/llvm-project/llvm/include/llvm/ADT/StringSet.h index 63d929399a4e..c4245175544b 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/StringSet.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/StringSet.h @@ -45,6 +45,9 @@ public: insert(const StringMapEntry &mapEntry) { return insert(mapEntry.getKey()); } + + /// Check if the set contains the given \c key. 
+ bool contains(StringRef key) const { return Base::FindKey(key) != -1; } }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/Triple.h b/contrib/llvm-project/llvm/include/llvm/ADT/Triple.h index 6bad18f19244..eed315c929ad 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/Triple.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/Triple.h @@ -56,6 +56,7 @@ public: avr, // AVR: Atmel AVR microcontroller bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) + csky, // CSKY: csky hexagon, // Hexagon: hexagon mips, // MIPS: mips, mipsallegrex, mipsr6 mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el @@ -63,6 +64,7 @@ public: mips64el, // MIPS64EL: mips64el, mips64r6el, mipsn32el, mipsn32r6el msp430, // MSP430: msp430 ppc, // PPC: powerpc + ppcle, // PPCLE: powerpc (little endian) ppc64, // PPC64: powerpc64, ppu ppc64le, // PPC64LE: powerpc64le r600, // R600: AMD GPUs HD2XXX - HD6XXX @@ -103,6 +105,7 @@ public: enum SubArchType { NoSubArch, + ARMSubArch_v8_7a, ARMSubArch_v8_6a, ARMSubArch_v8_5a, ARMSubArch_v8_4a, @@ -128,6 +131,8 @@ public: ARMSubArch_v5te, ARMSubArch_v4t, + AArch64SubArch_arm64e, + KalimbaSubArch_v3, KalimbaSubArch_v4, KalimbaSubArch_v5, @@ -142,8 +147,6 @@ public: Apple, PC, SCEI, - BGP, - BGQ, Freescale, IBM, ImaginationTechnologies, @@ -175,11 +178,11 @@ public: OpenBSD, Solaris, Win32, + ZOS, Haiku, Minix, RTEMS, NaCl, // Native Client - CNK, // BG/P Compute-Node Kernel AIX, CUDA, // NVIDIA CUDA NVCL, // NVIDIA OpenCL @@ -206,6 +209,7 @@ public: GNUEABI, GNUEABIHF, GNUX32, + GNUILP32, CODE16, EABI, EABIHF, @@ -227,6 +231,7 @@ public: COFF, ELF, + GOFF, MachO, Wasm, XCOFF, @@ -471,6 +476,8 @@ public: return getSubArch() == Triple::ARMSubArch_v7k; } + bool isOSzOS() const { return getOS() == Triple::ZOS; } + /// isOSDarwin - Is this a "Darwin" OS (macOS, iOS, tvOS or watchOS). 
bool isOSDarwin() const { return isMacOSX() || isiOS() || isWatchOS(); @@ -484,6 +491,12 @@ public: return getEnvironment() == Triple::MacABI; } + /// Returns true for targets that run on a macOS machine. + bool isTargetMachineMac() const { + return isMacOSX() || (isOSDarwin() && (isSimulatorEnvironment() || + isMacCatalystEnvironment())); + } + bool isOSNetBSD() const { return getOS() == Triple::NetBSD; } @@ -623,6 +636,9 @@ public: return getObjectFormat() == Triple::COFF; } + /// Tests whether the OS uses the GOFF binary format. + bool isOSBinFormatGOFF() const { return getObjectFormat() == Triple::GOFF; } + /// Tests whether the environment is MachO. bool isOSBinFormatMachO() const { return getObjectFormat() == Triple::MachO; @@ -703,7 +719,20 @@ public: /// Tests whether the target is AArch64 (little and big endian). bool isAArch64() const { - return getArch() == Triple::aarch64 || getArch() == Triple::aarch64_be; + return getArch() == Triple::aarch64 || getArch() == Triple::aarch64_be || + getArch() == Triple::aarch64_32; + } + + /// Tests whether the target is AArch64 and pointers are the size specified by + /// \p PointerWidth. + bool isAArch64(int PointerWidth) const { + assert(PointerWidth == 64 || PointerWidth == 32); + if (!isAArch64()) + return false; + return getArch() == Triple::aarch64_32 || + getEnvironment() == Triple::GNUILP32 + ? PointerWidth == 32 + : PointerWidth == 64; } /// Tests whether the target is MIPS 32-bit (little and big endian). @@ -721,6 +750,17 @@ public: return isMIPS32() || isMIPS64(); } + /// Tests whether the target is PowerPC (32- or 64-bit LE or BE). + bool isPPC() const { + return getArch() == Triple::ppc || getArch() == Triple::ppc64 || + getArch() == Triple::ppcle || getArch() == Triple::ppc64le; + } + + /// Tests whether the target is 32-bit PowerPC (little and big endian). 
+ bool isPPC32() const { + return getArch() == Triple::ppc || getArch() == Triple::ppcle; + } + /// Tests whether the target is 64-bit PowerPC (little and big endian). bool isPPC64() const { return getArch() == Triple::ppc64 || getArch() == Triple::ppc64le; @@ -751,6 +791,17 @@ public: return getArch() == Triple::wasm32 || getArch() == Triple::wasm64; } + // Tests whether the target is CSKY + bool isCSKY() const { + return getArch() == Triple::csky; + } + + /// Tests whether the target is the Apple "arm64e" AArch64 subarch. + bool isArm64e() const { + return getArch() == Triple::aarch64 && + getSubArch() == Triple::AArch64SubArch_arm64e; + } + /// Tests whether the target supports comdat bool supportsCOMDAT() const { return !(isOSBinFormatMachO() || isOSBinFormatXCOFF()); @@ -761,6 +812,14 @@ public: return isAndroid() || isOSOpenBSD() || isWindowsCygwinEnvironment(); } + /// Tests whether the target uses -data-sections as default. + bool hasDefaultDataSections() const { + return isOSBinFormatXCOFF() || isWasm(); + } + + /// Tests if the environment supports dllimport/export annotations. 
+ bool hasDLLImportExport() const { return isOSWindows() || isPS4CPU(); } + /// @} /// @name Mutators /// @{ diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/iterator.h b/contrib/llvm-project/llvm/include/llvm/ADT/iterator.h index 9a1f6e1511e7..6625a3f6179e 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/iterator.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/iterator.h @@ -142,28 +142,30 @@ public: return tmp; } +#ifndef __cpp_impl_three_way_comparison bool operator!=(const DerivedT &RHS) const { - return !static_cast(this)->operator==(RHS); + return !(static_cast(*this) == RHS); } +#endif bool operator>(const DerivedT &RHS) const { static_assert( IsRandomAccess, "Relational operators are only defined for random access iterators."); - return !static_cast(this)->operator<(RHS) && - !static_cast(this)->operator==(RHS); + return !(static_cast(*this) < RHS) && + !(static_cast(*this) == RHS); } bool operator<=(const DerivedT &RHS) const { static_assert( IsRandomAccess, "Relational operators are only defined for random access iterators."); - return !static_cast(this)->operator>(RHS); + return !(static_cast(*this) > RHS); } bool operator>=(const DerivedT &RHS) const { static_assert( IsRandomAccess, "Relational operators are only defined for random access iterators."); - return !static_cast(this)->operator<(RHS); + return !(static_cast(*this) < RHS); } PointerT operator->() { return &static_cast(this)->operator*(); } @@ -260,12 +262,16 @@ public: return *static_cast(this); } - bool operator==(const DerivedT &RHS) const { return I == RHS.I; } - bool operator<(const DerivedT &RHS) const { + friend bool operator==(const iterator_adaptor_base &LHS, + const iterator_adaptor_base &RHS) { + return LHS.I == RHS.I; + } + friend bool operator<(const iterator_adaptor_base &LHS, + const iterator_adaptor_base &RHS) { static_assert( BaseT::IsRandomAccess, "Relational operators are only defined for random access iterators."); - return I < RHS.I; + return LHS.I < 
RHS.I; } ReferenceT operator*() const { return *I; } diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/iterator_range.h b/contrib/llvm-project/llvm/include/llvm/ADT/iterator_range.h index f038f6bf2128..a9b46a3aa45b 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/iterator_range.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/iterator_range.h @@ -18,7 +18,6 @@ #ifndef LLVM_ADT_ITERATOR_RANGE_H #define LLVM_ADT_ITERATOR_RANGE_H -#include #include namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/ADT/simple_ilist.h b/contrib/llvm-project/llvm/include/llvm/ADT/simple_ilist.h index 9257b47b9cf8..d4b6be347219 100644 --- a/contrib/llvm-project/llvm/include/llvm/ADT/simple_ilist.h +++ b/contrib/llvm-project/llvm/include/llvm/ADT/simple_ilist.h @@ -28,8 +28,8 @@ namespace llvm { /// This is a simple intrusive list for a \c T that inherits from \c /// ilist_node. The list never takes ownership of anything inserted in it. /// -/// Unlike \a iplist and \a ilist, \a simple_ilist never allocates or -/// deletes values, and has no callback traits. +/// Unlike \a iplist and \a ilist, \a simple_ilist never deletes +/// values, and has no callback traits. /// /// The API for adding nodes include \a push_front(), \a push_back(), and \a /// insert(). These all take values by reference (not by pointer), except for @@ -52,7 +52,7 @@ namespace llvm { /// to calling \a std::for_each() on the range to be discarded. /// /// The currently available \p Options customize the nodes in the list. The -/// same options must be specified in the \a ilist_node instantation for +/// same options must be specified in the \a ilist_node instantiation for /// compatibility (although the order is irrelevant). /// \li Use \a ilist_tag to designate which ilist_node for a given \p T this /// list should use. 
This is useful if a type \p T is part of multiple, diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h index c35ee2f499de..9f7461243f35 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -42,10 +42,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include @@ -56,9 +52,17 @@ namespace llvm { class AnalysisUsage; +class AtomicCmpXchgInst; class BasicAAResult; class BasicBlock; +class CatchPadInst; +class CatchReturnInst; class DominatorTree; +class FenceInst; +class Function; +class InvokeInst; +class PreservedAnalyses; +class TargetLibraryInfo; class Value; /// The possible results of an alias query. @@ -342,12 +346,28 @@ createModRefInfo(const FunctionModRefBehavior FMRB) { class AAQueryInfo { public: using LocPair = std::pair; - using AliasCacheT = SmallDenseMap; + struct CacheEntry { + AliasResult Result; + /// Number of times a NoAlias assumption has been used. + /// 0 for assumptions that have not been used, -1 for definitive results. + int NumAssumptionUses; + /// Whether this is a definitive (non-assumption) result. + bool isDefinitive() const { return NumAssumptionUses < 0; } + }; + using AliasCacheT = SmallDenseMap; AliasCacheT AliasCache; using IsCapturedCacheT = SmallDenseMap; IsCapturedCacheT IsCapturedCache; + /// How many active NoAlias assumption uses there are. + int NumAssumptionUses = 0; + + /// Location pairs for which an assumption based result is currently stored. + /// Used to remove all potentially incorrect results from the cache if an + /// assumption is disproven. 
+ SmallVector AssumptionBasedResults; + AAQueryInfo() : AliasCache(), IsCapturedCache() {} }; @@ -401,7 +421,8 @@ public: /// A convenience wrapper around the primary \c alias interface. AliasResult alias(const Value *V1, const Value *V2) { - return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown()); + return alias(MemoryLocation::getBeforeOrAfter(V1), + MemoryLocation::getBeforeOrAfter(V2)); } /// A trivial helper function to check to see if the specified pointers are @@ -418,7 +439,8 @@ public: /// A convenience wrapper around the \c isNoAlias helper interface. bool isNoAlias(const Value *V1, const Value *V2) { - return isNoAlias(MemoryLocation(V1), MemoryLocation(V2)); + return isNoAlias(MemoryLocation::getBeforeOrAfter(V1), + MemoryLocation::getBeforeOrAfter(V2)); } /// A trivial helper function to check to see if the specified pointers are @@ -440,7 +462,7 @@ public: /// A convenience wrapper around the primary \c pointsToConstantMemory /// interface. bool pointsToConstantMemory(const Value *P, bool OrLocal = false) { - return pointsToConstantMemory(MemoryLocation(P), OrLocal); + return pointsToConstantMemory(MemoryLocation::getBeforeOrAfter(P), OrLocal); } /// @} @@ -533,7 +555,7 @@ public: /// write at most from objects pointed to by their pointer-typed arguments /// (with arbitrary offsets). static bool onlyAccessesArgPointees(FunctionModRefBehavior MRB) { - return !(MRB & FMRL_Anywhere & ~FMRL_ArgumentPointees); + return !((unsigned)MRB & FMRL_Anywhere & ~FMRL_ArgumentPointees); } /// Checks if functions with the specified behavior are known to potentially @@ -541,26 +563,27 @@ public: /// (with arbitrary offsets). static bool doesAccessArgPointees(FunctionModRefBehavior MRB) { return isModOrRefSet(createModRefInfo(MRB)) && - (MRB & FMRL_ArgumentPointees); + ((unsigned)MRB & FMRL_ArgumentPointees); } /// Checks if functions with the specified behavior are known to read and /// write at most from memory that is inaccessible from LLVM IR. 
static bool onlyAccessesInaccessibleMem(FunctionModRefBehavior MRB) { - return !(MRB & FMRL_Anywhere & ~FMRL_InaccessibleMem); + return !((unsigned)MRB & FMRL_Anywhere & ~FMRL_InaccessibleMem); } /// Checks if functions with the specified behavior are known to potentially /// read or write from memory that is inaccessible from LLVM IR. static bool doesAccessInaccessibleMem(FunctionModRefBehavior MRB) { - return isModOrRefSet(createModRefInfo(MRB)) && (MRB & FMRL_InaccessibleMem); + return isModOrRefSet(createModRefInfo(MRB)) && + ((unsigned)MRB & FMRL_InaccessibleMem); } /// Checks if functions with the specified behavior are known to read and /// write at most from memory that is inaccessible from LLVM IR or objects /// pointed to by their pointer-typed arguments (with arbitrary offsets). static bool onlyAccessesInaccessibleOrArgMem(FunctionModRefBehavior MRB) { - return !(MRB & FMRL_Anywhere & + return !((unsigned)MRB & FMRL_Anywhere & ~(FMRL_InaccessibleMem | FMRL_ArgumentPointees)); } @@ -760,40 +783,7 @@ private: AAQueryInfo &AAQI); ModRefInfo getModRefInfo(const Instruction *I, const Optional &OptLoc, - AAQueryInfo &AAQIP) { - if (OptLoc == None) { - if (const auto *Call = dyn_cast(I)) { - return createModRefInfo(getModRefBehavior(Call)); - } - } - - const MemoryLocation &Loc = OptLoc.getValueOr(MemoryLocation()); - - switch (I->getOpcode()) { - case Instruction::VAArg: - return getModRefInfo((const VAArgInst *)I, Loc, AAQIP); - case Instruction::Load: - return getModRefInfo((const LoadInst *)I, Loc, AAQIP); - case Instruction::Store: - return getModRefInfo((const StoreInst *)I, Loc, AAQIP); - case Instruction::Fence: - return getModRefInfo((const FenceInst *)I, Loc, AAQIP); - case Instruction::AtomicCmpXchg: - return getModRefInfo((const AtomicCmpXchgInst *)I, Loc, AAQIP); - case Instruction::AtomicRMW: - return getModRefInfo((const AtomicRMWInst *)I, Loc, AAQIP); - case Instruction::Call: - return getModRefInfo((const CallInst *)I, Loc, AAQIP); - case 
Instruction::Invoke: - return getModRefInfo((const InvokeInst *)I, Loc, AAQIP); - case Instruction::CatchPad: - return getModRefInfo((const CatchPadInst *)I, Loc, AAQIP); - case Instruction::CatchRet: - return getModRefInfo((const CatchReturnInst *)I, Loc, AAQIP); - default: - return ModRefInfo::NoModRef; - } - } + AAQueryInfo &AAQIP); class Concept; @@ -807,6 +797,9 @@ private: std::vector AADeps; + /// Query depth used to distinguish recursive queries. + unsigned Depth = 0; + friend class BatchAAResults; }; @@ -847,6 +840,13 @@ public: FunctionModRefBehavior getModRefBehavior(const CallBase *Call) { return AA.getModRefBehavior(Call); } + bool isMustAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) { + return alias(LocA, LocB) == MustAlias; + } + bool isMustAlias(const Value *V1, const Value *V2) { + return alias(MemoryLocation(V1, LocationSize::precise(1)), + MemoryLocation(V2, LocationSize::precise(1))) == MustAlias; + } }; /// Temporary typedef for legacy code that uses a generic \c AliasAnalysis @@ -1107,9 +1107,6 @@ public: /// Return true if this pointer is returned by a noalias function. bool isNoAliasCall(const Value *V); -/// Return true if this is an argument with the noalias attribute. -bool isNoAliasArgument(const Value *V); - /// Return true if this pointer refers to a distinct and identifiable object. 
/// This returns true for: /// Global Variables and Functions (but not Global Aliases) @@ -1157,12 +1154,7 @@ public: ResultGetters.push_back(&getModuleAAResultImpl); } - Result run(Function &F, FunctionAnalysisManager &AM) { - Result R(AM.getResult(F)); - for (auto &Getter : ResultGetters) - (*Getter)(F, AM, R); - return R; - } + Result run(Function &F, FunctionAnalysisManager &AM); private: friend AnalysisInfoMixin; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasSetTracker.h b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasSetTracker.h index 690a94d9cf2c..b27fd5aa92a7 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasSetTracker.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasSetTracker.h @@ -20,9 +20,10 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include @@ -33,6 +34,7 @@ namespace llvm { +class AAResults; class AliasSetTracker; class BasicBlock; class LoadInst; @@ -45,6 +47,8 @@ class StoreInst; class VAArgInst; class Value; +enum AliasResult : uint8_t; + class AliasSet : public ilist_node { friend class AliasSetTracker; @@ -293,7 +297,7 @@ private: void addPointer(AliasSetTracker &AST, PointerRec &Entry, LocationSize Size, const AAMDNodes &AAInfo, bool KnownMustAlias = false, bool SkipSizeUpdate = false); - void addUnknownInst(Instruction *I, AliasAnalysis &AA); + void addUnknownInst(Instruction *I, AAResults &AA); void removeUnknownInst(AliasSetTracker &AST, Instruction *I) { bool WasEmpty = UnknownInsts.empty(); @@ -311,8 +315,8 @@ public: /// If the specified pointer "may" (or must) alias one of the members in the /// set return the appropriate AliasResult. Otherwise return NoAlias. 
AliasResult aliasesPointer(const Value *Ptr, LocationSize Size, - const AAMDNodes &AAInfo, AliasAnalysis &AA) const; - bool aliasesUnknownInst(const Instruction *Inst, AliasAnalysis &AA) const; + const AAMDNodes &AAInfo, AAResults &AA) const; + bool aliasesUnknownInst(const Instruction *Inst, AAResults &AA) const; }; inline raw_ostream& operator<<(raw_ostream &OS, const AliasSet &AS) { @@ -338,7 +342,7 @@ class AliasSetTracker { /// handle. struct ASTCallbackVHDenseMapInfo : public DenseMapInfo {}; - AliasAnalysis &AA; + AAResults &AA; MemorySSA *MSSA = nullptr; Loop *L = nullptr; ilist AliasSets; @@ -352,9 +356,9 @@ class AliasSetTracker { public: /// Create an empty collection of AliasSets, and use the specified alias /// analysis object to disambiguate load and store addresses. - explicit AliasSetTracker(AliasAnalysis &aa) : AA(aa) {} - explicit AliasSetTracker(AliasAnalysis &aa, MemorySSA *mssa, Loop *l) - : AA(aa), MSSA(mssa), L(l) {} + explicit AliasSetTracker(AAResults &AA) : AA(AA) {} + explicit AliasSetTracker(AAResults &AA, MemorySSA *MSSA, Loop *L) + : AA(AA), MSSA(MSSA), L(L) {} ~AliasSetTracker() { clear(); } /// These methods are used to add different types of instructions to the alias @@ -393,7 +397,7 @@ public: AliasSet &getAliasSetFor(const MemoryLocation &MemLoc); /// Return the underlying alias analysis object used by this tracker. - AliasAnalysis &getAliasAnalysis() const { return AA; } + AAResults &getAliasAnalysis() const { return AA; } /// This method is used to remove a pointer value from the AliasSetTracker /// entirely. 
It should be used when an instruction is deleted from the @@ -457,6 +461,14 @@ inline raw_ostream& operator<<(raw_ostream &OS, const AliasSetTracker &AST) { return OS; } +class AliasSetsPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit AliasSetsPrinterPass(raw_ostream &OS); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + } // end namespace llvm #endif // LLVM_ANALYSIS_ALIASSETTRACKER_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/AssumptionCache.h b/contrib/llvm-project/llvm/include/llvm/Analysis/AssumptionCache.h index 0ef63dc68e1c..c4602d3449c0 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/AssumptionCache.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/AssumptionCache.h @@ -45,7 +45,7 @@ public: enum : unsigned { ExprResultIdx = std::numeric_limits::max() }; struct ResultElem { - WeakTrackingVH Assume; + WeakVH Assume; /// contains either ExprResultIdx or the index of the operand bundle /// containing the knowledge. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index 9214bfcd7a24..46b8cd1f3a88 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -18,9 +18,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/MemoryLocation.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include @@ -120,6 +117,9 @@ private: APInt Scale; + // Context instruction to use when querying information about this index. 
+ const Instruction *CxtI; + bool operator==(const VariableGEPIndex &Other) const { return V == Other.V && ZExtBits == Other.ZExtBits && SExtBits == Other.SExtBits && Scale == Other.Scale; @@ -128,6 +128,17 @@ private: bool operator!=(const VariableGEPIndex &Other) const { return !operator==(Other); } + + void dump() const { + print(dbgs()); + dbgs() << "\n"; + } + void print(raw_ostream &OS) const { + OS << "(V=" << V->getName() + << ", zextbits=" << ZExtBits + << ", sextbits=" << SExtBits + << ", scale=" << Scale << ")"; + } }; // Represents the internal structure of a GEP, decomposed into a base pointer, @@ -135,15 +146,29 @@ private: struct DecomposedGEP { // Base pointer of the GEP const Value *Base; - // Total constant offset w.r.t the base from indexing into structs - APInt StructOffset; - // Total constant offset w.r.t the base from indexing through - // pointers/arrays/vectors - APInt OtherOffset; + // Total constant offset from base. + APInt Offset; // Scaled variable (non-constant) indices. SmallVector VarIndices; // Is GEP index scale compile-time constant. bool HasCompileTimeConstantScale; + + void dump() const { + print(dbgs()); + dbgs() << "\n"; + } + void print(raw_ostream &OS) const { + OS << "(DecomposedGEP Base=" << Base->getName() + << ", Offset=" << Offset + << ", VarIndices=["; + for (size_t i = 0; i < VarIndices.size(); i++) { + if (i != 0) + OS << ", "; + VarIndices[i].print(OS); + } + OS << "], HasCompileTimeConstantScale=" << HasCompileTimeConstantScale + << ")"; + } }; /// Tracks phi nodes we have visited. 
@@ -171,8 +196,9 @@ private: const DataLayout &DL, unsigned Depth, AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW); - static bool DecomposeGEPExpression(const Value *V, DecomposedGEP &Decomposed, - const DataLayout &DL, AssumptionCache *AC, DominatorTree *DT); + static DecomposedGEP + DecomposeGEPExpression(const Value *V, const DataLayout &DL, + AssumptionCache *AC, DominatorTree *DT); static bool isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp, const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject, @@ -206,18 +232,23 @@ private: AliasResult aliasPHI(const PHINode *PN, LocationSize PNSize, const AAMDNodes &PNAAInfo, const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo, - const Value *UnderV2, AAQueryInfo &AAQI); + AAQueryInfo &AAQI); AliasResult aliasSelect(const SelectInst *SI, LocationSize SISize, const AAMDNodes &SIAAInfo, const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo, - const Value *UnderV2, AAQueryInfo &AAQI); + AAQueryInfo &AAQI); AliasResult aliasCheck(const Value *V1, LocationSize V1Size, - AAMDNodes V1AATag, const Value *V2, - LocationSize V2Size, AAMDNodes V2AATag, - AAQueryInfo &AAQI, const Value *O1 = nullptr, - const Value *O2 = nullptr); + const AAMDNodes &V1AATag, const Value *V2, + LocationSize V2Size, const AAMDNodes &V2AATag, + AAQueryInfo &AAQI); + + AliasResult aliasCheckRecursive(const Value *V1, LocationSize V1Size, + const AAMDNodes &V1AATag, const Value *V2, + LocationSize V2Size, const AAMDNodes &V2AATag, + AAQueryInfo &AAQI, const Value *O1, + const Value *O2); }; /// Analysis pass providing a never-invalidated alias analysis result. 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/contrib/llvm-project/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 868da7a64f68..c22787531117 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -169,7 +169,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) { /// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on /// the block type (or that call such algorithms) are skipped here. /// -/// Nevertheless, the majority of the overall algorithm documention lives with +/// Nevertheless, the majority of the overall algorithm documentation lives with /// BlockFrequencyInfoImpl. See there for details. class BlockFrequencyInfoImplBase { public: @@ -458,7 +458,7 @@ public: /// Analyze irreducible SCCs. /// - /// Separate irreducible SCCs from \c G, which is an explict graph of \c + /// Separate irreducible SCCs from \c G, which is an explicit graph of \c /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr). /// Insert them into \a Loops before \c Insert. /// @@ -706,7 +706,7 @@ void IrreducibleGraph::addEdges(const BlockNode &Node, /// /// In addition to loops, this algorithm has limited support for irreducible /// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are -/// discovered on they fly, and modelled as loops with multiple headers. +/// discovered on the fly, and modelled as loops with multiple headers. 
/// /// The headers of irreducible sub-SCCs consist of its entry blocks and all /// nodes that are targets of a backedge within it (excluding backedges within @@ -1246,7 +1246,7 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { } } // As a heuristic, if some headers don't have a weight, give them the - // minimium weight seen (not to disrupt the existing trends too much by + // minimum weight seen (not to disrupt the existing trends too much by // using a weight that's in the general range of the other headers' weights, // and the minimum seems to perform better than the average.) // FIXME: better update in the passes that drop the header weight. @@ -1449,8 +1449,8 @@ void BlockFrequencyInfoImpl::verifyMatch( BlockNode Node = Entry.second; if (OtherValidNodes.count(BB)) { BlockNode OtherNode = OtherValidNodes[BB]; - auto Freq = Freqs[Node.Index]; - auto OtherFreq = Other.Freqs[OtherNode.Index]; + const auto &Freq = Freqs[Node.Index]; + const auto &OtherFreq = Other.Freqs[OtherNode.Index]; if (Freq.Integer != OtherFreq.Integer) { Match = false; dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " " diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 3e72afba36c3..6a286236a80e 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -27,13 +27,16 @@ #include #include #include +#include #include namespace llvm { class Function; +class Loop; class LoopInfo; class raw_ostream; +class DominatorTree; class PostDominatorTree; class TargetLibraryInfo; class Value; @@ -50,20 +53,79 @@ class Value; /// identify an edge, since we can have multiple edges from Src to Dst. /// As an example, we can have a switch which jumps to Dst with value 0 and /// value 10. 
+/// +/// Process of computing branch probabilities can be logically viewed as three +/// step process: +/// +/// First, if there is a profile information associated with the branch then +/// it is trivially translated to branch probabilities. There is one exception +/// from this rule though. Probabilities for edges leading to "unreachable" +/// blocks (blocks with the estimated weight not greater than +/// UNREACHABLE_WEIGHT) are evaluated according to static estimation and +/// override profile information. If no branch probabilities were calculated +/// on this step then take the next one. +/// +/// Second, estimate absolute execution weights for each block based on +/// statically known information. Roots of such information are "cold", +/// "unreachable", "noreturn" and "unwind" blocks. Those blocks get their +/// weights set to BlockExecWeight::COLD, BlockExecWeight::UNREACHABLE, +/// BlockExecWeight::NORETURN and BlockExecWeight::UNWIND respectively. Then the +/// weights are propagated to the other blocks up the domination line. In +/// addition, if all successors have estimated weights set then maximum of these +/// weights assigned to the block itself (while this is not ideal heuristic in +/// theory it's simple and works reasonably well in most cases) and the process +/// repeats. Once the process of weights propagation converges branch +/// probabilities are set for all such branches that have at least one successor +/// with the weight set. Default execution weight (BlockExecWeight::DEFAULT) is +/// used for any successors which doesn't have its weight set. For loop back +/// branches we use their weights scaled by loop trip count equal to +/// 'LBH_TAKEN_WEIGHT/LBH_NOTTAKEN_WEIGHT'. +/// +/// Here is a simple example demonstrating how the described algorithm works. +/// +/// BB1 +/// / \ +/// v v +/// BB2 BB3 +/// / \ +/// v v +/// ColdBB UnreachBB +/// +/// Initially, ColdBB is associated with COLD_WEIGHT and UnreachBB with +/// UNREACHABLE_WEIGHT. 
COLD_WEIGHT is set to BB2 as maximum between its +/// successors. BB1 and BB3 has no explicit estimated weights and assumed to +/// have DEFAULT_WEIGHT. Based on assigned weights branches will have the +/// following probabilities: +/// P(BB1->BB2) = COLD_WEIGHT/(COLD_WEIGHT + DEFAULT_WEIGHT) = +/// 0xffff / (0xffff + 0xfffff) = 0.0588(5.9%) +/// P(BB1->BB3) = DEFAULT_WEIGHT_WEIGHT/(COLD_WEIGHT + DEFAULT_WEIGHT) = +/// 0xfffff / (0xffff + 0xfffff) = 0.941(94.1%) +/// P(BB2->ColdBB) = COLD_WEIGHT/(COLD_WEIGHT + UNREACHABLE_WEIGHT) = 1(100%) +/// P(BB2->UnreachBB) = +/// UNREACHABLE_WEIGHT/(COLD_WEIGHT+UNREACHABLE_WEIGHT) = 0(0%) +/// +/// If no branch probabilities were calculated on this step then take the next +/// one. +/// +/// Third, apply different kinds of local heuristics for each individual +/// branch until first match. For example probability of a pointer to be null is +/// estimated as PH_TAKEN_WEIGHT/(PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT). If +/// no local heuristic has been matched then branch is left with no explicit +/// probability set and assumed to have default probability. 
class BranchProbabilityInfo { public: BranchProbabilityInfo() = default; BranchProbabilityInfo(const Function &F, const LoopInfo &LI, const TargetLibraryInfo *TLI = nullptr, + DominatorTree *DT = nullptr, PostDominatorTree *PDT = nullptr) { - calculate(F, LI, TLI, PDT); + calculate(F, LI, TLI, DT, PDT); } BranchProbabilityInfo(BranchProbabilityInfo &&Arg) : Probs(std::move(Arg.Probs)), LastF(Arg.LastF), - PostDominatedByUnreachable(std::move(Arg.PostDominatedByUnreachable)), - PostDominatedByColdCall(std::move(Arg.PostDominatedByColdCall)) {} + EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {} BranchProbabilityInfo(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete; @@ -71,8 +133,7 @@ public: BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) { releaseMemory(); Probs = std::move(RHS.Probs); - PostDominatedByColdCall = std::move(RHS.PostDominatedByColdCall); - PostDominatedByUnreachable = std::move(RHS.PostDominatedByUnreachable); + EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight); return *this; } @@ -121,16 +182,6 @@ public: raw_ostream &printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, const BasicBlock *Dst) const; -protected: - /// Set the raw edge probability for the given edge. - /// - /// This allows a pass to explicitly set the edge probability for an edge. It - /// can be used when updating the CFG to update and preserve the branch - /// probability information. Read the implementation of how these edge - /// probabilities are calculated carefully before using! - void setEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors, - BranchProbability Prob); - public: /// Set the raw probabilities for all edges from the given block. /// @@ -140,24 +191,85 @@ public: void setEdgeProbability(const BasicBlock *Src, const SmallVectorImpl &Probs); + /// Copy outgoing edge probabilities from \p Src to \p Dst. 
+ /// + /// This allows to keep probabilities unset for the destination if they were + /// unset for source. + void copyEdgeProbabilities(BasicBlock *Src, BasicBlock *Dst); + static BranchProbability getBranchProbStackProtector(bool IsLikely) { static const BranchProbability LikelyProb((1u << 20) - 1, 1u << 20); return IsLikely ? LikelyProb : LikelyProb.getCompl(); } void calculate(const Function &F, const LoopInfo &LI, - const TargetLibraryInfo *TLI, PostDominatorTree *PDT); + const TargetLibraryInfo *TLI, DominatorTree *DT, + PostDominatorTree *PDT); /// Forget analysis results for the given basic block. void eraseBlock(const BasicBlock *BB); - // Use to track SCCs for handling irreducible loops. - using SccMap = DenseMap; - using SccHeaderMap = DenseMap; - using SccHeaderMaps = std::vector; - struct SccInfo { + // Data structure to track SCCs for handling irreducible loops. + class SccInfo { + // Enum of types to classify basic blocks in SCC. Basic block belonging to + // SCC is 'Inner' until it is either 'Header' or 'Exiting'. Note that a + // basic block can be 'Header' and 'Exiting' at the same time. + enum SccBlockType { + Inner = 0x0, + Header = 0x1, + Exiting = 0x2, + }; + // Map of basic blocks to SCC IDs they belong to. If basic block doesn't + // belong to any SCC it is not in the map. + using SccMap = DenseMap; + // Each basic block in SCC is attributed with one or several types from + // SccBlockType. Map value has uint32_t type (instead of SccBlockType) + // since basic block may be for example "Header" and "Exiting" at the same + // time and we need to be able to keep more than one value from + // SccBlockType. + using SccBlockTypeMap = DenseMap; + // Vector containing classification of basic blocks for all SCCs where i'th + // vector element corresponds to SCC with ID equal to i. 
+ using SccBlockTypeMaps = std::vector; + SccMap SccNums; - SccHeaderMaps SccHeaders; + SccBlockTypeMaps SccBlocks; + + public: + explicit SccInfo(const Function &F); + + /// If \p BB belongs to some SCC then ID of that SCC is returned, otherwise + /// -1 is returned. If \p BB belongs to more than one SCC at the same time + /// result is undefined. + int getSCCNum(const BasicBlock *BB) const; + /// Returns true if \p BB is a 'header' block in SCC with \p SccNum ID, + /// false otherwise. + bool isSCCHeader(const BasicBlock *BB, int SccNum) const { + return getSccBlockType(BB, SccNum) & Header; + } + /// Returns true if \p BB is an 'exiting' block in SCC with \p SccNum ID, + /// false otherwise. + bool isSCCExitingBlock(const BasicBlock *BB, int SccNum) const { + return getSccBlockType(BB, SccNum) & Exiting; + } + /// Fills in \p Enters vector with all such blocks that don't belong to + /// SCC with \p SccNum ID but there is an edge to a block belonging to the + /// SCC. + void getSccEnterBlocks(int SccNum, + SmallVectorImpl &Enters) const; + /// Fills in \p Exits vector with all such blocks that don't belong to + /// SCC with \p SccNum ID but there is an edge from a block belonging to the + /// SCC. + void getSccExitBlocks(int SccNum, + SmallVectorImpl &Exits) const; + + private: + /// Returns \p BB's type according to classification given by SccBlockType + /// enum. Please note that \p BB must belong to SSC with \p SccNum ID. + uint32_t getSccBlockType(const BasicBlock *BB, int SccNum) const; + /// Calculates \p BB's type and stores it in internal data structures for + /// future use. Please note that \p BB must belong to SSC with \p SccNum ID. 
+ void calculateSccBlockType(const BasicBlock *BB, int SccNum); }; private: @@ -169,7 +281,6 @@ private: void deleted() override { assert(BPI != nullptr); BPI->eraseBlock(cast(getValPtr())); - BPI->Handles.erase(*this); } public: @@ -177,44 +288,132 @@ private: : CallbackVH(const_cast(V)), BPI(BPI) {} }; + /// Pair of Loop and SCC ID number. Used to unify handling of normal and + /// SCC based loop representations. + using LoopData = std::pair; + /// Helper class to keep basic block along with its loop data information. + class LoopBlock { + public: + explicit LoopBlock(const BasicBlock *BB, const LoopInfo &LI, + const SccInfo &SccI); + + const BasicBlock *getBlock() const { return BB; } + BasicBlock *getBlock() { return const_cast(BB); } + LoopData getLoopData() const { return LD; } + Loop *getLoop() const { return LD.first; } + int getSccNum() const { return LD.second; } + + bool belongsToLoop() const { return getLoop() || getSccNum() != -1; } + bool belongsToSameLoop(const LoopBlock &LB) const { + return (LB.getLoop() && getLoop() == LB.getLoop()) || + (LB.getSccNum() != -1 && getSccNum() == LB.getSccNum()); + } + + private: + const BasicBlock *const BB = nullptr; + LoopData LD = {nullptr, -1}; + }; + + // Pair of LoopBlocks representing an edge from first to second block. + using LoopEdge = std::pair; + DenseSet> Handles; // Since we allow duplicate edges from one basic block to another, we use // a pair (PredBlock and an index in the successors) to specify an edge. using Edge = std::pair; - // Default weight value. Used when we don't have information about the edge. - // TODO: DEFAULT_WEIGHT makes sense during static predication, when none of - // the successors have a weight yet. But it doesn't make sense when providing - // weight to an edge that may have siblings with non-zero weights. This can - // be handled various ways, but it's probably fine for an edge with unknown - // weight to just "inherit" the non-zero weight of an adjacent successor. 
- static const uint32_t DEFAULT_WEIGHT = 16; - DenseMap Probs; /// Track the last function we run over for printing. const Function *LastF = nullptr; - /// Track the set of blocks directly succeeded by a returning block. - SmallPtrSet PostDominatedByUnreachable; + const LoopInfo *LI = nullptr; + + /// Keeps information about all SCCs in a function. + std::unique_ptr SccI; - /// Track the set of blocks that always lead to a cold call. - SmallPtrSet PostDominatedByColdCall; + /// Keeps mapping of a basic block to its estimated weight. + SmallDenseMap EstimatedBlockWeight; + + /// Keeps mapping of a loop to estimated weight to enter the loop. + SmallDenseMap EstimatedLoopWeight; + + /// Helper to construct LoopBlock for \p BB. + LoopBlock getLoopBlock(const BasicBlock *BB) const { + return LoopBlock(BB, *LI, *SccI.get()); + } - void computePostDominatedByUnreachable(const Function &F, - PostDominatorTree *PDT); - void computePostDominatedByColdCall(const Function &F, - PostDominatorTree *PDT); - bool calcUnreachableHeuristics(const BasicBlock *BB); + /// Returns true if destination block belongs to some loop and source block is + /// either doesn't belong to any loop or belongs to a loop which is not inner + /// relative to the destination block. + bool isLoopEnteringEdge(const LoopEdge &Edge) const; + /// Returns true if source block belongs to some loop and destination block is + /// either doesn't belong to any loop or belongs to a loop which is not inner + /// relative to the source block. + bool isLoopExitingEdge(const LoopEdge &Edge) const; + /// Returns true if \p Edge is either enters to or exits from some loop, false + /// in all other cases. + bool isLoopEnteringExitingEdge(const LoopEdge &Edge) const; + /// Returns true if source and destination blocks belongs to the same loop and + /// destination block is loop header. + bool isLoopBackEdge(const LoopEdge &Edge) const; + // Fills in \p Enters vector with all "enter" blocks to a loop \LB belongs to. 
+ void getLoopEnterBlocks(const LoopBlock &LB, + SmallVectorImpl &Enters) const; + // Fills in \p Exits vector with all "exit" blocks from a loop \LB belongs to. + void getLoopExitBlocks(const LoopBlock &LB, + SmallVectorImpl &Exits) const; + + /// Returns estimated weight for \p BB. None if \p BB has no estimated weight. + Optional getEstimatedBlockWeight(const BasicBlock *BB) const; + + /// Returns estimated weight to enter \p L. In other words it is weight of + /// loop's header block not scaled by trip count. Returns None if \p L has no + /// no estimated weight. + Optional getEstimatedLoopWeight(const LoopData &L) const; + + /// Return estimated weight for \p Edge. Returns None if estimated weight is + /// unknown. + Optional getEstimatedEdgeWeight(const LoopEdge &Edge) const; + + /// Iterates over all edges leading from \p SrcBB to \p Successors and + /// returns maximum of all estimated weights. If at least one edge has unknown + /// estimated weight None is returned. + template + Optional + getMaxEstimatedEdgeWeight(const LoopBlock &SrcBB, + iterator_range Successors) const; + + /// If \p LoopBB has no estimated weight then set it to \p BBWeight and + /// return true. Otherwise \p BB's weight remains unchanged and false is + /// returned. In addition all blocks/loops that might need their weight to be + /// re-estimated are put into BlockWorkList/LoopWorkList. + bool updateEstimatedBlockWeight(LoopBlock &LoopBB, uint32_t BBWeight, + SmallVectorImpl &BlockWorkList, + SmallVectorImpl &LoopWorkList); + + /// Starting from \p LoopBB (including \p LoopBB itself) propagate \p BBWeight + /// up the domination tree. + void propagateEstimatedBlockWeight(const LoopBlock &LoopBB, DominatorTree *DT, + PostDominatorTree *PDT, uint32_t BBWeight, + SmallVectorImpl &WorkList, + SmallVectorImpl &LoopWorkList); + + /// Returns block's weight encoded in the IR. 
+ Optional getInitialEstimatedBlockWeight(const BasicBlock *BB); + + // Computes estimated weights for all blocks in \p F. + void computeEestimateBlockWeight(const Function &F, DominatorTree *DT, + PostDominatorTree *PDT); + + /// Based on computed weights by \p computeEstimatedBlockWeight set + /// probabilities on branches. + bool calcEstimatedHeuristics(const BasicBlock *BB); bool calcMetadataWeights(const BasicBlock *BB); - bool calcColdCallHeuristics(const BasicBlock *BB); bool calcPointerHeuristics(const BasicBlock *BB); - bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI, - SccInfo &SccI); bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI); bool calcFloatingPointHeuristics(const BasicBlock *BB); - bool calcInvokeHeuristics(const BasicBlock *BB); }; /// Analysis pass which computes \c BranchProbabilityInfo. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/CFGPrinter.h b/contrib/llvm-project/llvm/include/llvm/Analysis/CFGPrinter.h index c4e49ce493ea..53700798b6b3 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/CFGPrinter.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/CFGPrinter.h @@ -18,6 +18,7 @@ #ifndef LLVM_ANALYSIS_CFGPRINTER_H #define LLVM_ANALYSIS_CFGPRINTER_H +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/HeatUtils.h" @@ -141,8 +142,18 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { return OS.str(); } - static std::string getCompleteNodeLabel(const BasicBlock *Node, - DOTFuncInfo *) { + static void eraseComment(std::string &OutStr, unsigned &I, unsigned Idx) { + OutStr.erase(OutStr.begin() + I, OutStr.begin() + Idx); + --I; + } + + static std::string getCompleteNodeLabel( + const BasicBlock *Node, DOTFuncInfo *, + llvm::function_ref + HandleBasicBlock = [](raw_string_ostream &OS, + const BasicBlock &Node) -> void { OS << Node; }, + llvm::function_ref + 
HandleComment = eraseComment) { enum { MaxColumns = 80 }; std::string Str; raw_string_ostream OS(Str); @@ -152,7 +163,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { OS << ":"; } - OS << *Node; + HandleBasicBlock(OS, *Node); std::string OutStr = OS.str(); if (OutStr[0] == '\n') OutStr.erase(OutStr.begin()); @@ -168,8 +179,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { LastSpace = 0; } else if (OutStr[i] == ';') { // Delete comments! unsigned Idx = OutStr.find('\n', i + 1); // Find end of line - OutStr.erase(OutStr.begin() + i, OutStr.begin() + Idx); - --i; + HandleComment(OutStr, i, Idx); } else if (ColNum == MaxColumns) { // Wrap lines. // Wrap very long names even though we can't find a space. if (!LastSpace) @@ -285,7 +295,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { " fillcolor=\"" + Color + "70\""; return Attrs; } - bool isNodeHidden(const BasicBlock *Node); + bool isNodeHidden(const BasicBlock *Node, const DOTFuncInfo *CFGInfo); void computeHiddenNodes(const Function *F); }; } // End llvm namespace diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/CGSCCPassManager.h b/contrib/llvm-project/llvm/include/llvm/Analysis/CGSCCPassManager.h index eb0d3ae8fedf..985424a74054 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/CGSCCPassManager.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/CGSCCPassManager.h @@ -90,6 +90,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -314,6 +315,16 @@ struct CGSCCUpdateResult { /// for a better technique. SmallDenseSet, 4> &InlinedInternalEdges; + + /// Weak VHs to keep track of indirect calls for the purposes of detecting + /// devirtualization. + /// + /// This is a map to avoid having duplicate entries. If a Value is + /// deallocated, its corresponding WeakTrackingVH will be nulled out. 
When + /// checking if a Value is in the map or not, also check if the corresponding + /// WeakTrackingVH is null to avoid issues with a new Value sharing the same + /// address as a deallocated one. + SmallMapVector IndirectVHs; }; /// The core module pass which does a post-order walk of the SCCs and @@ -325,18 +336,15 @@ struct CGSCCUpdateResult { /// \c CGSCCAnalysisManagerModuleProxy analysis prior to running the CGSCC /// pass over the module to enable a \c FunctionAnalysisManager to be used /// within this run safely. -template class ModuleToPostOrderCGSCCPassAdaptor - : public PassInfoMixin> { + : public PassInfoMixin { public: - explicit ModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT Pass) - : Pass(std::move(Pass)) {} + using PassConceptT = + detail::PassConcept; - // We have to explicitly define all the special member functions because MSVC - // refuses to generate them. - ModuleToPostOrderCGSCCPassAdaptor( - const ModuleToPostOrderCGSCCPassAdaptor &Arg) - : Pass(Arg.Pass) {} + explicit ModuleToPostOrderCGSCCPassAdaptor(std::unique_ptr Pass) + : Pass(std::move(Pass)) {} ModuleToPostOrderCGSCCPassAdaptor(ModuleToPostOrderCGSCCPassAdaptor &&Arg) : Pass(std::move(Arg.Pass)) {} @@ -355,16 +363,22 @@ public: /// Runs the CGSCC pass across every SCC in the module. PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } + private: - CGSCCPassT Pass; + std::unique_ptr Pass; }; /// A function to deduce a function pass type and wrap it in the /// templated adaptor. template -ModuleToPostOrderCGSCCPassAdaptor +ModuleToPostOrderCGSCCPassAdaptor createModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT Pass) { - return ModuleToPostOrderCGSCCPassAdaptor(std::move(Pass)); + using PassModelT = detail::PassModel; + return ModuleToPostOrderCGSCCPassAdaptor( + std::make_unique(std::move(Pass))); } /// A proxy from a \c FunctionAnalysisManager to an \c SCC. 
@@ -442,17 +456,13 @@ LazyCallGraph::SCC &updateCGAndAnalysisManagerForCGSCCPass( /// \c FunctionAnalysisManagerCGSCCProxy analysis prior to running the function /// pass over the SCC to enable a \c FunctionAnalysisManager to be used /// within this run safely. -template class CGSCCToFunctionPassAdaptor - : public PassInfoMixin> { + : public PassInfoMixin { public: - explicit CGSCCToFunctionPassAdaptor(FunctionPassT Pass) - : Pass(std::move(Pass)) {} + using PassConceptT = detail::PassConcept; - // We have to explicitly define all the special member functions because MSVC - // refuses to generate them. - CGSCCToFunctionPassAdaptor(const CGSCCToFunctionPassAdaptor &Arg) - : Pass(Arg.Pass) {} + explicit CGSCCToFunctionPassAdaptor(std::unique_ptr Pass) + : Pass(std::move(Pass)) {} CGSCCToFunctionPassAdaptor(CGSCCToFunctionPassAdaptor &&Arg) : Pass(std::move(Arg.Pass)) {} @@ -469,90 +479,24 @@ public: /// Runs the function pass across every function in the module. PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, - LazyCallGraph &CG, CGSCCUpdateResult &UR) { - // Setup the function analysis manager from its proxy. - FunctionAnalysisManager &FAM = - AM.getResult(C, CG).getManager(); - - SmallVector Nodes; - for (LazyCallGraph::Node &N : C) - Nodes.push_back(&N); - - // The SCC may get split while we are optimizing functions due to deleting - // edges. If this happens, the current SCC can shift, so keep track of - // a pointer we can overwrite. - LazyCallGraph::SCC *CurrentC = &C; - - LLVM_DEBUG(dbgs() << "Running function passes across an SCC: " << C - << "\n"); - - PreservedAnalyses PA = PreservedAnalyses::all(); - for (LazyCallGraph::Node *N : Nodes) { - // Skip nodes from other SCCs. These may have been split out during - // processing. We'll eventually visit those SCCs and pick up the nodes - // there. 
- if (CG.lookupSCC(*N) != CurrentC) - continue; - - Function &F = N->getFunction(); - - PassInstrumentation PI = FAM.getResult(F); - if (!PI.runBeforePass(Pass, F)) - continue; - - PreservedAnalyses PassPA; - { - TimeTraceScope TimeScope(Pass.name()); - PassPA = Pass.run(F, FAM); - } - - PI.runAfterPass(Pass, F); - - // We know that the function pass couldn't have invalidated any other - // function's analyses (that's the contract of a function pass), so - // directly handle the function analysis manager's invalidation here. - FAM.invalidate(F, PassPA); - - // Then intersect the preserved set so that invalidation of module - // analyses will eventually occur when the module pass completes. - PA.intersect(std::move(PassPA)); - - // If the call graph hasn't been preserved, update it based on this - // function pass. This may also update the current SCC to point to - // a smaller, more refined SCC. - auto PAC = PA.getChecker(); - if (!PAC.preserved() && !PAC.preservedSet>()) { - CurrentC = &updateCGAndAnalysisManagerForFunctionPass(CG, *CurrentC, *N, - AM, UR, FAM); - assert( - CG.lookupSCC(*N) == CurrentC && - "Current SCC not updated to the SCC containing the current node!"); - } - } + LazyCallGraph &CG, CGSCCUpdateResult &UR); - // By definition we preserve the proxy. And we preserve all analyses on - // Functions. This precludes *any* invalidation of function analyses by the - // proxy, but that's OK because we've taken care to invalidate analyses in - // the function analysis manager incrementally above. - PA.preserveSet>(); - PA.preserve(); - - // We've also ensured that we updated the call graph along the way. - PA.preserve(); - - return PA; - } + static bool isRequired() { return true; } private: - FunctionPassT Pass; + std::unique_ptr Pass; }; /// A function to deduce a function pass type and wrap it in the /// templated adaptor. 
template -CGSCCToFunctionPassAdaptor +CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT Pass) { - return CGSCCToFunctionPassAdaptor(std::move(Pass)); + using PassModelT = + detail::PassModel; + return CGSCCToFunctionPassAdaptor( + std::make_unique(std::move(Pass))); } /// A helper that repeats an SCC pass each time an indirect call is refined to @@ -569,410 +513,36 @@ createCGSCCToFunctionPassAdaptor(FunctionPassT Pass) { /// This repetition has the potential to be very large however, as each one /// might refine a single call site. As a consequence, in practice we use an /// upper bound on the number of repetitions to limit things. -template -class DevirtSCCRepeatedPass - : public PassInfoMixin> { +class DevirtSCCRepeatedPass : public PassInfoMixin { public: - explicit DevirtSCCRepeatedPass(PassT Pass, int MaxIterations) + using PassConceptT = + detail::PassConcept; + + explicit DevirtSCCRepeatedPass(std::unique_ptr Pass, + int MaxIterations) : Pass(std::move(Pass)), MaxIterations(MaxIterations) {} /// Runs the wrapped pass up to \c MaxIterations on the SCC, iterating /// whenever an indirect call is refined. PreservedAnalyses run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, - LazyCallGraph &CG, CGSCCUpdateResult &UR) { - PreservedAnalyses PA = PreservedAnalyses::all(); - PassInstrumentation PI = - AM.getResult(InitialC, CG); - - // The SCC may be refined while we are running passes over it, so set up - // a pointer that we can update. - LazyCallGraph::SCC *C = &InitialC; - - // Collect value handles for all of the indirect call sites. - SmallVector CallHandles; - - // Struct to track the counts of direct and indirect calls in each function - // of the SCC. - struct CallCount { - int Direct; - int Indirect; - }; - - // Put value handles on all of the indirect calls and return the number of - // direct calls for each function in the SCC. 
- auto ScanSCC = [](LazyCallGraph::SCC &C, - SmallVectorImpl &CallHandles) { - assert(CallHandles.empty() && "Must start with a clear set of handles."); - - SmallDenseMap CallCounts; - CallCount CountLocal = {0, 0}; - for (LazyCallGraph::Node &N : C) { - CallCount &Count = - CallCounts.insert(std::make_pair(&N.getFunction(), CountLocal)) - .first->second; - for (Instruction &I : instructions(N.getFunction())) - if (auto *CB = dyn_cast(&I)) { - if (CB->getCalledFunction()) { - ++Count.Direct; - } else { - ++Count.Indirect; - CallHandles.push_back(WeakTrackingVH(&I)); - } - } - } - - return CallCounts; - }; - - // Populate the initial call handles and get the initial call counts. - auto CallCounts = ScanSCC(*C, CallHandles); - - for (int Iteration = 0;; ++Iteration) { - - if (!PI.runBeforePass(Pass, *C)) - continue; - - PreservedAnalyses PassPA = Pass.run(*C, AM, CG, UR); - - if (UR.InvalidatedSCCs.count(C)) - PI.runAfterPassInvalidated(Pass); - else - PI.runAfterPass(Pass, *C); - - // If the SCC structure has changed, bail immediately and let the outer - // CGSCC layer handle any iteration to reflect the refined structure. - if (UR.UpdatedC && UR.UpdatedC != C) { - PA.intersect(std::move(PassPA)); - break; - } - - // Check that we didn't miss any update scenario. - assert(!UR.InvalidatedSCCs.count(C) && "Processing an invalid SCC!"); - assert(C->begin() != C->end() && "Cannot have an empty SCC!"); - - // Check whether any of the handles were devirtualized. - auto IsDevirtualizedHandle = [&](WeakTrackingVH &CallH) { - if (!CallH) - return false; - auto *CB = dyn_cast(CallH); - if (!CB) - return false; - - // If the call is still indirect, leave it alone. 
- Function *F = CB->getCalledFunction(); - if (!F) - return false; - - LLVM_DEBUG(dbgs() << "Found devirtualized call from " - << CB->getParent()->getParent()->getName() << " to " - << F->getName() << "\n"); - - // We now have a direct call where previously we had an indirect call, - // so iterate to process this devirtualization site. - return true; - }; - bool Devirt = llvm::any_of(CallHandles, IsDevirtualizedHandle); - - // Rescan to build up a new set of handles and count how many direct - // calls remain. If we decide to iterate, this also sets up the input to - // the next iteration. - CallHandles.clear(); - auto NewCallCounts = ScanSCC(*C, CallHandles); - - // If we haven't found an explicit devirtualization already see if we - // have decreased the number of indirect calls and increased the number - // of direct calls for any function in the SCC. This can be fooled by all - // manner of transformations such as DCE and other things, but seems to - // work well in practice. - if (!Devirt) - // Iterate over the keys in NewCallCounts, if Function also exists in - // CallCounts, make the check below. - for (auto &Pair : NewCallCounts) { - auto &CallCountNew = Pair.second; - auto CountIt = CallCounts.find(Pair.first); - if (CountIt != CallCounts.end()) { - const auto &CallCountOld = CountIt->second; - if (CallCountOld.Indirect > CallCountNew.Indirect && - CallCountOld.Direct < CallCountNew.Direct) { - Devirt = true; - break; - } - } - } - - if (!Devirt) { - PA.intersect(std::move(PassPA)); - break; - } - - // Otherwise, if we've already hit our max, we're done. 
- if (Iteration >= MaxIterations) { - LLVM_DEBUG( - dbgs() << "Found another devirtualization after hitting the max " - "number of repetitions (" - << MaxIterations << ") on SCC: " << *C << "\n"); - PA.intersect(std::move(PassPA)); - break; - } - - LLVM_DEBUG( - dbgs() - << "Repeating an SCC pass after finding a devirtualization in: " << *C - << "\n"); - - // Move over the new call counts in preparation for iterating. - CallCounts = std::move(NewCallCounts); - - // Update the analysis manager with each run and intersect the total set - // of preserved analyses so we're ready to iterate. - AM.invalidate(*C, PassPA); - - PA.intersect(std::move(PassPA)); - } - - // Note that we don't add any preserved entries here unlike a more normal - // "pass manager" because we only handle invalidation *between* iterations, - // not after the last iteration. - return PA; - } + LazyCallGraph &CG, CGSCCUpdateResult &UR); private: - PassT Pass; + std::unique_ptr Pass; int MaxIterations; }; /// A function to deduce a function pass type and wrap it in the /// templated adaptor. -template -DevirtSCCRepeatedPass createDevirtSCCRepeatedPass(PassT Pass, - int MaxIterations) { - return DevirtSCCRepeatedPass(std::move(Pass), MaxIterations); -} - -// Out-of-line implementation details for templates below this point. - template -PreservedAnalyses -ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, - ModuleAnalysisManager &AM) { - // Setup the CGSCC analysis manager from its proxy. - CGSCCAnalysisManager &CGAM = - AM.getResult(M).getManager(); - - // Get the call graph for this module. - LazyCallGraph &CG = AM.getResult(M); - - // Get Function analysis manager from its proxy. - FunctionAnalysisManager &FAM = - AM.getCachedResult(M)->getManager(); - - // We keep worklists to allow us to push more work onto the pass manager as - // the passes are run. 
- SmallPriorityWorklist RCWorklist; - SmallPriorityWorklist CWorklist; - - // Keep sets for invalidated SCCs and RefSCCs that should be skipped when - // iterating off the worklists. - SmallPtrSet InvalidRefSCCSet; - SmallPtrSet InvalidSCCSet; - - SmallDenseSet, 4> - InlinedInternalEdges; - - CGSCCUpdateResult UR = { - RCWorklist, CWorklist, InvalidRefSCCSet, InvalidSCCSet, - nullptr, nullptr, PreservedAnalyses::all(), InlinedInternalEdges}; - - // Request PassInstrumentation from analysis manager, will use it to run - // instrumenting callbacks for the passes later. - PassInstrumentation PI = AM.getResult(M); - - PreservedAnalyses PA = PreservedAnalyses::all(); - CG.buildRefSCCs(); - for (auto RCI = CG.postorder_ref_scc_begin(), - RCE = CG.postorder_ref_scc_end(); - RCI != RCE;) { - assert(RCWorklist.empty() && - "Should always start with an empty RefSCC worklist"); - // The postorder_ref_sccs range we are walking is lazily constructed, so - // we only push the first one onto the worklist. The worklist allows us - // to capture *new* RefSCCs created during transformations. - // - // We really want to form RefSCCs lazily because that makes them cheaper - // to update as the program is simplified and allows us to have greater - // cache locality as forming a RefSCC touches all the parts of all the - // functions within that RefSCC. - // - // We also eagerly increment the iterator to the next position because - // the CGSCC passes below may delete the current RefSCC. - RCWorklist.insert(&*RCI++); - - do { - LazyCallGraph::RefSCC *RC = RCWorklist.pop_back_val(); - if (InvalidRefSCCSet.count(RC)) { - LLVM_DEBUG(dbgs() << "Skipping an invalid RefSCC...\n"); - continue; - } - - assert(CWorklist.empty() && - "Should always start with an empty SCC worklist"); - - LLVM_DEBUG(dbgs() << "Running an SCC pass across the RefSCC: " << *RC - << "\n"); - - // The top of the worklist may *also* be the same SCC we just ran over - // (and invalidated for). 
Keep track of that last SCC we processed due - // to SCC update to avoid redundant processing when an SCC is both just - // updated itself and at the top of the worklist. - LazyCallGraph::SCC *LastUpdatedC = nullptr; - - // Push the initial SCCs in reverse post-order as we'll pop off the - // back and so see this in post-order. - for (LazyCallGraph::SCC &C : llvm::reverse(*RC)) - CWorklist.insert(&C); - - do { - LazyCallGraph::SCC *C = CWorklist.pop_back_val(); - // Due to call graph mutations, we may have invalid SCCs or SCCs from - // other RefSCCs in the worklist. The invalid ones are dead and the - // other RefSCCs should be queued above, so we just need to skip both - // scenarios here. - if (InvalidSCCSet.count(C)) { - LLVM_DEBUG(dbgs() << "Skipping an invalid SCC...\n"); - continue; - } - if (LastUpdatedC == C) { - LLVM_DEBUG(dbgs() << "Skipping redundant run on SCC: " << *C << "\n"); - continue; - } - if (&C->getOuterRefSCC() != RC) { - LLVM_DEBUG(dbgs() << "Skipping an SCC that is now part of some other " - "RefSCC...\n"); - continue; - } - - // Ensure we can proxy analysis updates from the CGSCC analysis manager - // into the the Function analysis manager by getting a proxy here. - // This also needs to update the FunctionAnalysisManager, as this may be - // the first time we see this SCC. - CGAM.getResult(*C, CG).updateFAM( - FAM); - - // Each time we visit a new SCC pulled off the worklist, - // a transformation of a child SCC may have also modified this parent - // and invalidated analyses. So we invalidate using the update record's - // cross-SCC preserved set. This preserved set is intersected by any - // CGSCC pass that handles invalidation (primarily pass managers) prior - // to marking its SCC as preserved. That lets us track everything that - // might need invalidation across SCCs without excessive invalidations - // on a single SCC. - // - // This essentially allows SCC passes to freely invalidate analyses - // of any ancestor SCC. 
If this becomes detrimental to successfully - // caching analyses, we could force each SCC pass to manually - // invalidate the analyses for any SCCs other than themselves which - // are mutated. However, that seems to lose the robustness of the - // pass-manager driven invalidation scheme. - CGAM.invalidate(*C, UR.CrossSCCPA); - - do { - // Check that we didn't miss any update scenario. - assert(!InvalidSCCSet.count(C) && "Processing an invalid SCC!"); - assert(C->begin() != C->end() && "Cannot have an empty SCC!"); - assert(&C->getOuterRefSCC() == RC && - "Processing an SCC in a different RefSCC!"); - - LastUpdatedC = UR.UpdatedC; - UR.UpdatedRC = nullptr; - UR.UpdatedC = nullptr; - - // Check the PassInstrumentation's BeforePass callbacks before - // running the pass, skip its execution completely if asked to - // (callback returns false). - if (!PI.runBeforePass(Pass, *C)) - continue; - - PreservedAnalyses PassPA; - { - TimeTraceScope TimeScope(Pass.name()); - PassPA = Pass.run(*C, CGAM, CG, UR); - } - - if (UR.InvalidatedSCCs.count(C)) - PI.runAfterPassInvalidated(Pass); - else - PI.runAfterPass(Pass, *C); - - // Update the SCC and RefSCC if necessary. - C = UR.UpdatedC ? UR.UpdatedC : C; - RC = UR.UpdatedRC ? UR.UpdatedRC : RC; - - if (UR.UpdatedC) { - // If we're updating the SCC, also update the FAM inside the proxy's - // result. - CGAM.getResult(*C, CG).updateFAM( - FAM); - } - - // If the CGSCC pass wasn't able to provide a valid updated SCC, - // the current SCC may simply need to be skipped if invalid. - if (UR.InvalidatedSCCs.count(C)) { - LLVM_DEBUG(dbgs() << "Skipping invalidated root or island SCC!\n"); - break; - } - // Check that we didn't miss any update scenario. - assert(C->begin() != C->end() && "Cannot have an empty SCC!"); - - // We handle invalidating the CGSCC analysis manager's information - // for the (potentially updated) SCC here. 
Note that any other SCCs - // whose structure has changed should have been invalidated by - // whatever was updating the call graph. This SCC gets invalidated - // late as it contains the nodes that were actively being - // processed. - CGAM.invalidate(*C, PassPA); - - // Then intersect the preserved set so that invalidation of module - // analyses will eventually occur when the module pass completes. - // Also intersect with the cross-SCC preserved set to capture any - // cross-SCC invalidation. - UR.CrossSCCPA.intersect(PassPA); - PA.intersect(std::move(PassPA)); - - // The pass may have restructured the call graph and refined the - // current SCC and/or RefSCC. We need to update our current SCC and - // RefSCC pointers to follow these. Also, when the current SCC is - // refined, re-run the SCC pass over the newly refined SCC in order - // to observe the most precise SCC model available. This inherently - // cannot cycle excessively as it only happens when we split SCCs - // apart, at most converging on a DAG of single nodes. - // FIXME: If we ever start having RefSCC passes, we'll want to - // iterate there too. - if (UR.UpdatedC) - LLVM_DEBUG(dbgs() - << "Re-running SCC passes after a refinement of the " - "current SCC: " - << *UR.UpdatedC << "\n"); - - // Note that both `C` and `RC` may at this point refer to deleted, - // invalid SCC and RefSCCs respectively. But we will short circuit - // the processing when we check them in the loop above. - } while (UR.UpdatedC); - } while (!CWorklist.empty()); - - // We only need to keep internal inlined edge information within - // a RefSCC, clear it to save on space and let the next time we visit - // any of these functions have a fresh start. - InlinedInternalEdges.clear(); - } while (!RCWorklist.empty()); - } - - // By definition we preserve the call garph, all SCC analyses, and the - // analysis proxies by handling them above and in any nested pass managers. 
- PA.preserveSet>(); - PA.preserve(); - PA.preserve(); - PA.preserve(); - return PA; +DevirtSCCRepeatedPass createDevirtSCCRepeatedPass(CGSCCPassT Pass, + int MaxIterations) { + using PassModelT = detail::PassModel; + return DevirtSCCRepeatedPass(std::make_unique(std::move(Pass)), + MaxIterations); } // Clear out the debug logging macro. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/CallGraph.h b/contrib/llvm-project/llvm/include/llvm/Analysis/CallGraph.h index 98f9b0683fd4..4da448c9900b 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/CallGraph.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/CallGraph.h @@ -87,13 +87,6 @@ class CallGraph { /// or calling an external function. std::unique_ptr CallsExternalNode; - /// Replace the function represented by this node by another. - /// - /// This does not rescan the body of the function, so it is suitable when - /// splicing the body of one function to another while also updating all - /// callers from the old function to the new. - void spliceFunction(const Function *From, const Function *To); - public: explicit CallGraph(Module &M); CallGraph(CallGraph &&Arg); diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/CaptureTracking.h b/contrib/llvm-project/llvm/include/llvm/Analysis/CaptureTracking.h index e68675b278f1..9da5f18e944b 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/CaptureTracking.h @@ -13,6 +13,8 @@ #ifndef LLVM_ANALYSIS_CAPTURETRACKING_H #define LLVM_ANALYSIS_CAPTURETRACKING_H +#include "llvm/ADT/DenseMap.h" + namespace llvm { class Value; @@ -94,6 +96,12 @@ namespace llvm { /// is zero, a default value is assumed. void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, unsigned MaxUsesToExplore = 0); + + /// Returns true if the pointer is to a function-local object that never + /// escapes from the function. 
+ bool isNonEscapingLocalObject( + const Value *V, + SmallDenseMap *IsCapturedCache = nullptr); } // end namespace llvm #endif diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/CodeMetrics.h b/contrib/llvm-project/llvm/include/llvm/Analysis/CodeMetrics.h index eab24c8ab179..615591aa83ad 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/CodeMetrics.h @@ -75,7 +75,8 @@ struct CodeMetrics { /// Add information about a block to the current state. void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, - const SmallPtrSetImpl &EphValues); + const SmallPtrSetImpl &EphValues, + bool PrepareForLTO = false); /// Collect a loop's ephemeral values (those used only by an assume /// or similar intrinsics in the loop). diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ConstantFolding.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ConstantFolding.h index 0ccc782ad6f5..ef6e66b2b88e 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ConstantFolding.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ConstantFolding.h @@ -25,6 +25,7 @@ template class ArrayRef; class CallBase; class Constant; class ConstantExpr; +class DSOLocalEquivalent; class DataLayout; class Function; class GlobalValue; @@ -34,8 +35,11 @@ class Type; /// If this constant is a constant offset from a global, return the global and /// the constant. Because of constantexprs, this function is recursive. +/// If the global is part of a dso_local_equivalent constant, return it through +/// `Equiv` if it is provided. bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, APInt &Offset, - const DataLayout &DL); + const DataLayout &DL, + DSOLocalEquivalent **DSOEquiv = nullptr); /// ConstantFoldInstruction - Try to constant fold the specified instruction. /// If successful, the constant result is returned, if not, null is returned. 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ConstraintSystem.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ConstraintSystem.h new file mode 100644 index 000000000000..83c1fb4485fd --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -0,0 +1,88 @@ +//===- ConstraintSystem.h - A system of linear constraints. --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_CONSTRAINTSYSTEM_H +#define LLVM_ANALYSIS_CONSTRAINTSYSTEM_H + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" + +#include + +namespace llvm { + +class ConstraintSystem { + /// Current linear constraints in the system. + /// An entry of the form c0, c1, ... cn represents the following constraint: + /// c0 >= v0 * c1 + .... + v{n-1} * cn + SmallVector, 4> Constraints; + + /// Current greatest common divisor for all coefficients in the system. + uint32_t GCD = 1; + + // Eliminate constraints from the system using Fourier–Motzkin elimination. + bool eliminateUsingFM(); + + /// Print the constraints in the system, using \p Names as variable names. + void dump(ArrayRef Names) const; + + /// Print the constraints in the system, using x0...xn as variable names. + void dump() const; + + /// Returns true if there may be a solution for the constraints in the system. + bool mayHaveSolutionImpl(); + +public: + bool addVariableRow(const SmallVector &R) { + assert(Constraints.empty() || R.size() == Constraints.back().size()); + // If all variable coefficients are 0, the constraint does not provide any + // usable information. 
+ if (all_of(makeArrayRef(R).drop_front(1), [](int64_t C) { return C == 0; })) + return false; + + for (const auto &C : R) { + auto A = std::abs(C); + GCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)A}, {32, GCD}) + .getZExtValue(); + } + Constraints.push_back(R); + return true; + } + + bool addVariableRowFill(const SmallVector &R) { + for (auto &CR : Constraints) { + while (CR.size() != R.size()) + CR.push_back(0); + } + return addVariableRow(R); + } + + /// Returns true if there may be a solution for the constraints in the system. + bool mayHaveSolution(); + + static SmallVector negate(SmallVector R) { + // The negated constraint R is obtained by multiplying by -1 and adding 1 to + // the constant. + R[0] += 1; + for (auto &C : R) + C *= -1; + return R; + } + + bool isConditionImplied(SmallVector R); + + void popLastConstraint() { Constraints.pop_back(); } + + /// Returns the number of rows in the constraint system. + unsigned size() const { return Constraints.size(); } +}; +} // namespace llvm + +#endif // LLVM_ANALYSIS_CONSTRAINTSYSTEM_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/DDG.h b/contrib/llvm-project/llvm/include/llvm/Analysis/DDG.h index 9e2b7907eaec..e3bef33e55c3 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/DDG.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/DDG.h @@ -152,7 +152,7 @@ private: setKind((InstList.size() == 0 && Input.size() == 1) ? NodeKind::SingleInstruction : NodeKind::MultiInstruction); - InstList.insert(InstList.end(), Input.begin(), Input.end()); + llvm::append_range(InstList, Input); } void appendInstructions(const SimpleDDGNode &Input) { appendInstructions(Input.getInstructions()); @@ -290,6 +290,12 @@ public: bool getDependencies(const NodeType &Src, const NodeType &Dst, DependenceList &Deps) const; + /// Return a string representing the type of dependence that the dependence + /// analysis identified between the two given nodes. 
This function assumes + /// that there is a memory dependence between the given two nodes. + const std::string getDependenceString(const NodeType &Src, + const NodeType &Dst) const; + protected: // Name of the graph. std::string Name; @@ -463,6 +469,26 @@ bool DependenceGraphInfo::getDependencies( return !Deps.empty(); } +template +const std::string +DependenceGraphInfo::getDependenceString(const NodeType &Src, + const NodeType &Dst) const { + std::string Str; + raw_string_ostream OS(Str); + DependenceList Deps; + if (!getDependencies(Src, Dst, Deps)) + return OS.str(); + interleaveComma(Deps, OS, [&](const std::unique_ptr &D) { + D->dump(OS); + // Remove the extra new-line character printed by the dump + // method + if (OS.str().back() == '\n') + OS.str().pop_back(); + }); + + return OS.str(); +} + //===--------------------------------------------------------------------===// // GraphTraits specializations for the DDG //===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/DDGPrinter.h b/contrib/llvm-project/llvm/include/llvm/Analysis/DDGPrinter.h new file mode 100644 index 000000000000..4477b387fe50 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/DDGPrinter.h @@ -0,0 +1,91 @@ +//===- llvm/Analysis/DDGPrinter.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// This file defines the DOT printer for the Data-Dependence Graph (DDG). 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DDGPRINTER_H +#define LLVM_ANALYSIS_DDGPRINTER_H + +#include "llvm/Analysis/DDG.h" +#include "llvm/Pass.h" +#include "llvm/Support/DOTGraphTraits.h" + +namespace llvm { + +//===--------------------------------------------------------------------===// +// Implementation of DDG DOT Printer for a loop. +//===--------------------------------------------------------------------===// +class DDGDotPrinterPass : public PassInfoMixin { +public: + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +//===--------------------------------------------------------------------===// +// Specialization of DOTGraphTraits. +//===--------------------------------------------------------------------===// +template <> +struct DOTGraphTraits + : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} + + /// Generate a title for the graph in DOT format + std::string getGraphName(const DataDependenceGraph *G) { + assert(G && "expected a valid pointer to the graph."); + return "DDG for '" + std::string(G->getName()) + "'"; + } + + /// Print a DDG node either in concise form (-ddg-dot-only) or + /// verbose mode (-ddg-dot). + std::string getNodeLabel(const DDGNode *Node, + const DataDependenceGraph *Graph); + + /// Print attributes of an edge in the DDG graph. If the edge + /// is a MemoryDependence edge, then detailed dependence info + /// available from DependenceAnalysis is displayed. + std::string + getEdgeAttributes(const DDGNode *Node, + GraphTraits::ChildIteratorType I, + const DataDependenceGraph *G); + + /// Do not print nodes that are part of a pi-block separately. They + /// will be printed when their containing pi-block is being printed. 
+ bool isNodeHidden(const DDGNode *Node, const DataDependenceGraph *G); + +private: + /// Print a DDG node in concise form. + static std::string getSimpleNodeLabel(const DDGNode *Node, + const DataDependenceGraph *G); + + /// Print a DDG node with more information including containing instructions + /// and detailed information about the dependence edges. + static std::string getVerboseNodeLabel(const DDGNode *Node, + const DataDependenceGraph *G); + + /// Print a DDG edge in concise form. + static std::string getSimpleEdgeAttributes(const DDGNode *Src, + const DDGEdge *Edge, + const DataDependenceGraph *G); + + /// Print a DDG edge with more information including detailed information + /// about the dependence edges. + static std::string getVerboseEdgeAttributes(const DDGNode *Src, + const DDGEdge *Edge, + const DataDependenceGraph *G); +}; + +using DDGDotGraphTraits = DOTGraphTraits; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_DDGPRINTER_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Delinearization.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Delinearization.h new file mode 100644 index 000000000000..2658b6bbc80c --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Delinearization.h @@ -0,0 +1,33 @@ +//===---- Delinearization.h - MultiDimensional Index Delinearization ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements an analysis pass that tries to delinearize all GEP +// instructions in all loops using the SCEV analysis functionality. This pass is +// only used for testing purposes: if your pass needs delinearization, please +// use the on-demand SCEVAddRecExpr::delinearize() function. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DELINEARIZATION_H +#define LLVM_ANALYSIS_DELINEARIZATION_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +struct DelinearizationPrinterPass + : public PassInfoMixin { + explicit DelinearizationPrinterPass(raw_ostream &OS); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + raw_ostream &OS; +}; +} // namespace llvm + +#endif // LLVM_ANALYSIS_DELINEARIZATION_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/DemandedBits.h b/contrib/llvm-project/llvm/include/llvm/Analysis/DemandedBits.h index 04db3eb57c18..7a8618a27ce7 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/DemandedBits.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/DemandedBits.h @@ -61,6 +61,20 @@ public: void print(raw_ostream &OS); + /// Compute alive bits of one addition operand from alive output and known + /// operand bits + static APInt determineLiveOperandBitsAdd(unsigned OperandNo, + const APInt &AOut, + const KnownBits &LHS, + const KnownBits &RHS); + + /// Compute alive bits of one subtraction operand from alive output and known + /// operand bits + static APInt determineLiveOperandBitsSub(unsigned OperandNo, + const APInt &AOut, + const KnownBits &LHS, + const KnownBits &RHS); + private: void performAnalysis(); void determineLiveOperandBits(const Instruction *UserI, diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/DivergenceAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/DivergenceAnalysis.h index a2da97bb9059..2e4ae65d0981 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/DivergenceAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/DivergenceAnalysis.h @@ -59,8 +59,10 @@ public: /// \brief Mark \p UniVal as a value that is always uniform. 
void addUniformOverride(const Value &UniVal); - /// \brief Mark \p DivVal as a value that is always divergent. - void markDivergent(const Value &DivVal); + /// \brief Mark \p DivVal as a value that is always divergent. Will not do so + /// if `isAlwaysUniform(DivVal)`. + /// \returns Whether the tracked divergence state of \p DivVal changed. + bool markDivergent(const Value &DivVal); /// \brief Propagate divergence to all instructions in the region. /// Divergence is seeded by calls to \p markDivergent. @@ -76,45 +78,38 @@ public: /// \brief Whether \p Val is divergent at its definition. bool isDivergent(const Value &Val) const; - /// \brief Whether \p U is divergent. Uses of a uniform value can be divergent. + /// \brief Whether \p U is divergent. Uses of a uniform value can be + /// divergent. bool isDivergentUse(const Use &U) const; void print(raw_ostream &OS, const Module *) const; private: - bool updateTerminator(const Instruction &Term) const; - bool updatePHINode(const PHINode &Phi) const; - - /// \brief Computes whether \p Inst is divergent based on the - /// divergence of its operands. - /// - /// \returns Whether \p Inst is divergent. - /// - /// This should only be called for non-phi, non-terminator instructions. - bool updateNormalInstruction(const Instruction &Inst) const; - - /// \brief Mark users of live-out users as divergent. - /// - /// \param LoopHeader the header of the divergent loop. - /// - /// Marks all users of live-out values of the loop headed by \p LoopHeader - /// as divergent and puts them on the worklist. - void taintLoopLiveOuts(const BasicBlock &LoopHeader); - - /// \brief Push all users of \p Val (in the region) to the worklist + /// \brief Mark \p Term as divergent and push all Instructions that become + /// divergent as a result on the worklist. + void analyzeControlDivergence(const Instruction &Term); + /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on + /// the worklist. 
+ void taintAndPushPhiNodes(const BasicBlock &JoinBlock); + + /// \brief Identify all Instructions that become divergent because \p DivExit + /// is a divergent loop exit of \p DivLoop. Mark those instructions as + /// divergent and push them on the worklist. + void propagateLoopExitDivergence(const BasicBlock &DivExit, + const Loop &DivLoop); + + /// \brief Internal implementation function for propagateLoopExitDivergence. + void analyzeLoopExitDivergence(const BasicBlock &DivExit, + const Loop &OuterDivLoop); + + /// \brief Mark all instruction as divergent that use a value defined in \p + /// OuterDivLoop. Push their users on the worklist. + void analyzeTemporalDivergence(const Instruction &I, + const Loop &OuterDivLoop); + + /// \brief Push all users of \p Val (in the region) to the worklist. void pushUsers(const Value &I); - /// \brief Push all phi nodes in @block to the worklist - void pushPHINodes(const BasicBlock &Block); - - /// \brief Mark \p Block as join divergent - /// - /// A block is join divergent if two threads may reach it from different - /// incoming blocks at the same time. - void markBlockJoinDivergent(const BasicBlock &Block) { - DivergentJoinBlocks.insert(&Block); - } - /// \brief Whether \p Val is divergent when read in \p ObservingBlock. bool isTemporalDivergent(const BasicBlock &ObservingBlock, const Value &Val) const; @@ -123,27 +118,9 @@ private: /// /// (see markBlockJoinDivergent). bool isJoinDivergent(const BasicBlock &Block) const { - return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end(); + return DivergentJoinBlocks.contains(&Block); } - /// \brief Propagate control-induced divergence to users (phi nodes and - /// instructions). - // - // \param JoinBlock is a divergent loop exit or join point of two disjoint - // paths. - // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop. 
- bool propagateJoinDivergence(const BasicBlock &JoinBlock, - const Loop *TermLoop); - - /// \brief Propagate induced value divergence due to control divergence in \p - /// Term. - void propagateBranchDivergence(const Instruction &Term); - - /// \brief Propagate divergent caused by a divergent loop exit. - /// - /// \param ExitingLoop is a divergent loop. - void propagateLoopDivergence(const Loop &ExitingLoop); - private: const Function &F; // If regionLoop != nullptr, analysis is only performed within \p RegionLoop. @@ -166,7 +143,7 @@ private: DenseSet UniformOverrides; // Blocks with joining divergent control from different predecessors. - DenseSet DivergentJoinBlocks; + DenseSet DivergentJoinBlocks; // FIXME Deprecated // Detected/marked divergent values. DenseSet DivergentValues; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/DominanceFrontier.h b/contrib/llvm-project/llvm/include/llvm/Analysis/DominanceFrontier.h index f67929c997f9..cef5e03b3b7a 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -26,7 +26,6 @@ #include #include #include -#include namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/EHPersonalities.h b/contrib/llvm-project/llvm/include/llvm/Analysis/EHPersonalities.h index c17b0b4a90d3..eaada6627494 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/EHPersonalities.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/EHPersonalities.h @@ -28,11 +28,12 @@ enum class EHPersonality { GNU_CXX_SjLj, GNU_ObjC, MSVC_X86SEH, - MSVC_Win64SEH, + MSVC_TableSEH, MSVC_CXX, CoreCLR, Rust, - Wasm_CXX + Wasm_CXX, + XL_CXX }; /// See if the given exception handling personality function is one @@ -51,7 +52,7 @@ inline bool isAsynchronousEHPersonality(EHPersonality Pers) { // unknown personalities don't catch asynch exceptions. 
switch (Pers) { case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_TableSEH: return true; default: return false; @@ -65,7 +66,7 @@ inline bool isFuncletEHPersonality(EHPersonality Pers) { switch (Pers) { case EHPersonality::MSVC_CXX: case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_TableSEH: case EHPersonality::CoreCLR: return true; default: @@ -80,7 +81,7 @@ inline bool isScopedEHPersonality(EHPersonality Pers) { switch (Pers) { case EHPersonality::MSVC_CXX: case EHPersonality::MSVC_X86SEH: - case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_TableSEH: case EHPersonality::CoreCLR: case EHPersonality::Wasm_CXX: return true; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h new file mode 100644 index 000000000000..a5f96e72ce97 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -0,0 +1,86 @@ +//=- FunctionPropertiesAnalysis.h - Function Properties Analysis --*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the FunctionPropertiesInfo and FunctionPropertiesAnalysis +// classes used to extract function properties. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FUNCTIONPROPERTIESANALYSIS_H_ +#define LLVM_FUNCTIONPROPERTIESANALYSIS_H_ + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class Function; + +class FunctionPropertiesInfo { +public: + static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, + const LoopInfo &LI); + + void print(raw_ostream &OS) const; + + /// Number of basic blocks + int64_t BasicBlockCount = 0; + + /// Number of blocks reached from a conditional instruction, or that are + /// 'cases' of a SwitchInstr. + // FIXME: We may want to replace this with a more meaningful metric, like + // number of conditionally executed blocks: + // 'if (a) s();' would be counted here as 2 blocks, just like + // 'if (a) s(); else s2(); s3();' would. + int64_t BlocksReachedFromConditionalInstruction = 0; + + /// Number of uses of this function, plus 1 if the function is callable + /// outside the module. + int64_t Uses = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + // Load Instruction Count + int64_t LoadInstCount = 0; + + // Store Instruction Count + int64_t StoreInstCount = 0; + + // Maximum Loop Depth in the Function + int64_t MaxLoopDepth = 0; + + // Number of Top Level Loops in the Function + int64_t TopLevelLoopCount = 0; +}; + +// Analysis pass +class FunctionPropertiesAnalysis + : public AnalysisInfoMixin { + +public: + static AnalysisKey Key; + + using Result = FunctionPropertiesInfo; + + Result run(Function &F, FunctionAnalysisManager &FAM); +}; + +/// Printer pass for the FunctionPropertiesAnalysis results. 
+class FunctionPropertiesPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit FunctionPropertiesPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm +#endif // LLVM_FUNCTIONPROPERTIESANALYSIS_H_ diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/contrib/llvm-project/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h new file mode 100644 index 000000000000..9e97541e542b --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -0,0 +1,789 @@ +//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// Interface file for the IRSimilarityIdentifier for identifying similarities in +// IR including the IRInstructionMapper, which maps an Instruction to unsigned +// integers. +// +// Two sequences of instructions are called "similar" if they perform the same +// series of operations for all inputs. +// +// \code +// %1 = add i32 %a, 10 +// %2 = add i32 %a, %1 +// %3 = icmp slt icmp %1, %2 +// \endcode +// +// and +// +// \code +// %1 = add i32 11, %a +// %2 = sub i32 %a, %1 +// %3 = icmp sgt icmp %2, %1 +// \endcode +// +// ultimately have the same result, even if the inputs, and structure are +// slightly different. +// +// For instructions, we do not worry about operands that do not have fixed +// semantic meaning to the program. We consider the opcode that the instruction +// has, the types, parameters, and extra information such as the function name, +// or comparison predicate. 
These are used to create a hash to map instructions +// to integers to be used in similarity matching in sequences of instructions +// +// Terminology: +// An IRSimilarityCandidate is a region of IRInstructionData (wrapped +// Instructions), usually used to denote a region of similarity has been found. +// +// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally +// similar to one another. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H +#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H + +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" + +namespace llvm { +namespace IRSimilarity { + +struct IRInstructionDataList; + +/// This represents what is and is not supported when finding similarity in +/// Instructions. +/// +/// Legal Instructions are considered when looking at similarity between +/// Instructions. +/// +/// Illegal Instructions cannot be considered when looking for similarity +/// between Instructions. They act as boundaries between similarity regions. +/// +/// Invisible Instructions are skipped over during analysis. +// TODO: Shared with MachineOutliner +enum InstrType { Legal, Illegal, Invisible }; + +/// This provides the utilities for hashing an Instruction to an unsigned +/// integer. Two IRInstructionDatas produce the same hash value when their +/// underlying Instructions perform the same operation (even if they don't have +/// the same input operands.) 
+/// As a more concrete example, consider the following: +/// +/// \code +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// \endcode +/// +// Then the IRInstructionData wrappers for these Instructions may be hashed like +/// so: +/// +/// \code +/// ; These two adds have the same types and operand types, so they hash to the +/// ; same number. +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d ; Hash: 1 +/// ; This add produces an i64. This differentiates it from %add1 and %add2. So, +/// ; it hashes to a different number. +/// %add3 = add i64 %e, %f; Hash: 2 +/// \endcode +/// +/// +/// This hashing scheme will be used to represent the program as a very long +/// string. This string can then be placed in a data structure which can be used +/// for similarity queries. +/// +/// TODO: Handle types of Instructions which can be equal even with different +/// operands. (E.g. comparisons with swapped predicates.) +/// TODO: Handle CallInsts, which are only checked for function type +/// by \ref isSameOperationAs. +/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the +/// exact same, and some do not. +struct IRInstructionData : ilist_node { + + /// The source Instruction that is being wrapped. + Instruction *Inst = nullptr; + /// The values of the operands in the Instruction. + SmallVector OperVals; + /// The legality of the wrapped instruction. This is informed by InstrType, + /// and is used when checking when two instructions are considered similar. + /// If either instruction is not legal, the instructions are automatically not + /// considered similar. + bool Legal; + + /// This is only relevant if we are wrapping a CmpInst where we needed to + /// change the predicate of a compare instruction from a greater than form + /// to a less than form. It is None otherwise. 
+ Optional RevisedPredicate; + + /// Gather the information that is difficult to gather for an Instruction, or + /// is changed. i.e. the operands of an Instruction and the Types of those + /// operands. This extra information allows for similarity matching to make + /// assertions that allow for more flexibility when checking for whether an + /// Instruction performs the same operation. + IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); + + /// Get the predicate that the compare instruction is using for hashing the + /// instruction. The IRInstructionData must be wrapping a CmpInst. + CmpInst::Predicate getPredicate() const; + + /// A function that swaps the predicates to their less than form if they are + /// in a greater than form. Otherwise, the predicate is unchanged. + /// + /// \param CI - The comparison operation to find a consistent predicate for. + /// \return the consistent comparison predicate. + static CmpInst::Predicate predicateForConsistency(CmpInst *CI); + + /// Hashes \p Value based on its opcode, types, and operand types. + /// Two IRInstructionData instances produce the same hash when they perform + /// the same operation. + /// + /// As a simple example, consider the following instructions. + /// + /// \code + /// %add1 = add i32 %x1, %y1 + /// %add2 = add i32 %x2, %y2 + /// + /// %sub = sub i32 %x1, %y1 + /// + /// %add_i64 = add i64 %x2, %y2 + /// \endcode + /// + /// Because the first two adds operate on the same types, and are performing the + /// same action, they will be hashed to the same value. + /// + /// However, the subtraction instruction is not the same as an addition, and + /// will be hashed to a different value. + /// + /// Finally, the last add has a different type compared to the first two add + /// instructions, so it will also be hashed to a different value than any of + /// the previous instructions. + /// + /// \param [in] ID - The IRInstructionData instance to be hashed.
+ /// \returns A hash_value of the IRInstructionData. + friend hash_code hash_value(const IRInstructionData &ID) { + SmallVector OperTypes; + for (Value *V : ID.OperVals) + OperTypes.push_back(V->getType()); + + if (isa(ID.Inst)) + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_value(ID.getPredicate()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + else if (CallInst *CI = dyn_cast(ID.Inst)) + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_value(CI->getCalledFunction()->getName().str()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + return llvm::hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + } + + IRInstructionDataList *IDL = nullptr; +}; + +struct IRInstructionDataList : simple_ilist {}; + +/// Compare one IRInstructionData class to another IRInstructionData class for +/// whether they are performing the same operation, and can be mapped to the +/// same value. For regular instructions if the hash value is the same, then +/// they will also be close. +/// +/// \param A - The first IRInstructionData class to compare +/// \param B - The second IRInstructionData class to compare +/// \returns true if \p A and \p B are similar enough to be mapped to the same +/// value.
+bool isClose(const IRInstructionData &A, const IRInstructionData &B); + +struct IRInstructionDataTraits : DenseMapInfo { + static inline IRInstructionData *getEmptyKey() { return nullptr; } + static inline IRInstructionData *getTombstoneKey() { + return reinterpret_cast(-1); + } + + static unsigned getHashValue(const IRInstructionData *E) { + using llvm::hash_value; + assert(E && "IRInstructionData is a nullptr?"); + return hash_value(*E); + } + + static bool isEqual(const IRInstructionData *LHS, + const IRInstructionData *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey() || + LHS == getEmptyKey() || LHS == getTombstoneKey()) + return LHS == RHS; + + assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); + return isClose(*LHS, *RHS); + } +}; + +/// Helper struct for converting the Instructions in a Module into a vector of +/// unsigned integers. This vector of unsigned integers can be thought of as a +/// "numeric string". This numeric string can then be queried by, for example, +/// data structures that find repeated substrings. +/// +/// This hashing is done per BasicBlock in the module. To hash Instructions +/// based off of their operations, each Instruction is wrapped in an +/// IRInstructionData struct. The unsigned integer for an IRInstructionData +/// depends on: +/// - The hash provided by the IRInstructionData. +/// - Which member of InstrType the IRInstructionData is classified as. +// See InstrType for more details on the possible classifications, and how they +// manifest in the numeric string. +/// +/// The numeric string for an individual BasicBlock is terminated by an unique +/// unsigned integer. This prevents data structures which rely on repetition +/// from matching across BasicBlocks. (For example, the SuffixTree.) 
+/// As a concrete example, if we have the following two BasicBlocks: +/// \code +/// bb0: +/// %add1 = add i32 %a, %b +/// %add2 = add i32 %c, %d +/// %add3 = add i64 %e, %f +/// bb1: +/// %sub = sub i32 %c, %d +/// \endcode +/// We may hash the Instructions like this (via IRInstructionData): +/// \code +/// bb0: +/// %add1 = add i32 %a, %b ; Hash: 1 +/// %add2 = add i32 %c, %d; Hash: 1 +/// %add3 = add i64 %e, %f; Hash: 2 +/// bb1: +/// %sub = sub i32 %c, %d; Hash: 3 +/// %add4 = add i32 %c, %d ; Hash: 1 +/// \endcode +/// And produce a "numeric string representation" like so: +/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 +/// +/// TODO: This is very similar to the MachineOutliner, and should be +/// consolidated into the same interface. +struct IRInstructionMapper { + /// The starting illegal instruction number to map to. + /// + /// Set to -3 for compatibility with DenseMapInfo. + unsigned IllegalInstrNumber = static_cast(-3); + + /// The next available integer to assign to a legal Instruction to. + unsigned LegalInstrNumber = 0; + + /// Correspondence from IRInstructionData to unsigned integers. + DenseMap + InstructionIntegerMap; + + /// Set if we added an illegal number in the previous step. + /// Since each illegal number is unique, we only need one of them between + /// each range of legal numbers. This lets us make sure we don't add more + /// than one illegal number per range. + bool AddedIllegalLastTime = false; + + /// Marks whether we found a illegal instruction in the previous step. + bool CanCombineWithPrevInstr = false; + + /// Marks whether we have found a set of instructions that is long enough + /// to be considered for similarity. + bool HaveLegalRange = false; + + /// This allocator pointer is in charge of holding on to the IRInstructionData + /// so it is not deallocated until whatever external tool is using it is done + /// with the information. 
+ SpecificBumpPtrAllocator *InstDataAllocator = nullptr; + + /// This allocator pointer is in charge of creating the IRInstructionDataList + /// so it is not deallocated until whatever external tool is using it is done + /// with the information. + SpecificBumpPtrAllocator *IDLAllocator = nullptr; + + /// Get an allocated IRInstructionData struct using the InstDataAllocator. + /// + /// \param I - The Instruction to wrap with IRInstructionData. + /// \param Legality - A boolean value that is true if the instruction is to + /// be considered for similarity, and false if not. + /// \param IDL - The InstructionDataList that the IRInstructionData is + /// inserted into. + /// \returns An allocated IRInstructionData struct. + IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, + IRInstructionDataList &IDL); + + /// Get an allocated IRInstructionDataList object using the IDLAllocator. + /// + /// \returns An allocated IRInstructionDataList object. + IRInstructionDataList *allocateIRInstructionDataList(); + + IRInstructionDataList *IDL = nullptr; + + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers + /// determined by \p InstrType. Two Instructions are mapped to the same value + /// if they are close as defined by the InstructionData class above. + /// + /// \param [in] BB - The BasicBlock to be mapped to integers. + /// \param [in,out] InstrList - Vector of IRInstructionData to append to. + /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. + void convertToUnsignedVec(BasicBlock &BB, + std::vector &InstrList, + std::vector &IntegerMapping); + + /// Maps an Instruction to a legal integer. + /// + /// \param [in] It - The Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. + /// \returns The integer \p It was mapped to. 
+ unsigned mapToLegalUnsigned(BasicBlock::iterator &It, + std::vector &IntegerMappingForBB, + std::vector &InstrListForBB); + + /// Maps an Instruction to an illegal integer. + /// + /// \param [in] It - The \p Instruction to be mapped to an integer. + /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to + /// append to. + /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. + /// \param End - true if creating a dummy IRInstructionData at the end of a + /// basic block. + /// \returns The integer \p It was mapped to. + unsigned mapToIllegalUnsigned( + BasicBlock::iterator &It, std::vector &IntegerMappingForBB, + std::vector &InstrListForBB, bool End = false); + + IRInstructionMapper(SpecificBumpPtrAllocator *IDA, + SpecificBumpPtrAllocator *IDLA) + : InstDataAllocator(IDA), IDLAllocator(IDLA) { + // Make sure that the implementation of DenseMapInfo hasn't + // changed. + assert(DenseMapInfo::getEmptyKey() == static_cast(-1) && + "DenseMapInfo's empty key isn't -1!"); + assert(DenseMapInfo::getTombstoneKey() == + static_cast(-2) && + "DenseMapInfo's tombstone key isn't -2!"); + + IDL = new (IDLAllocator->Allocate()) + IRInstructionDataList(); + } + + /// Custom InstVisitor to classify different instructions for whether it can + /// be analyzed for similarity. + struct InstructionClassification + : public InstVisitor { + InstructionClassification() {} + + // TODO: Determine a scheme to resolve when the label is similar enough. + InstrType visitBranchInst(BranchInst &BI) { return Illegal; } + // TODO: Determine a scheme to resolve when the labels are similar enough. + InstrType visitPHINode(PHINode &PN) { return Illegal; } + // TODO: Handle allocas. + InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } + // We exclude variable argument instructions since variable arguments + // requires extra checking of the argument list. 
+ InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } + // We exclude all exception handling cases since they are so context + // dependent. + InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } + InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } + // DebugInfo should be included in the regions, but should not be + // analyzed for similarity as it has no bearing on the outcome of the + // program. + InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } + // TODO: Handle specific intrinsics. + InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; } + // We only allow call instructions where the function has a name and + // is not an indirect call. + InstrType visitCallInst(CallInst &CI) { + Function *F = CI.getCalledFunction(); + if (!F || CI.isIndirectCall() || !F->hasName()) + return Illegal; + return Legal; + } + // TODO: We do not currently handle similarity that changes the control flow. + InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } + // TODO: We do not currently handle similarity that changes the control flow. + InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } + // TODO: Handle interblock similarity. + InstrType visitTerminator(Instruction &I) { return Illegal; } + InstrType visitInstruction(Instruction &I) { return Legal; } + }; + + /// Maps an Instruction to a member of InstrType. + InstructionClassification InstClassifier; +}; + +/// This is a class that wraps a range of IRInstructionData from one point to +/// another in the vector of IRInstructionData, which is a region of the +/// program. It is also responsible for defining the structure within this +/// region of instructions. +/// +/// The structure of a region is defined through a value numbering system +/// assigned to each unique value in a region at the creation of the +/// IRSimilarityCandidate.
+/// +/// For example, for each Instruction we add a mapping for each new +/// value seen in that Instruction. +/// IR: Mapping Added: +/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 +/// %add2 = add i32 %a, %1 %add2 -> 4 +/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 +/// +/// We can compare IRSimilarityCandidates against one another. +/// The \ref isSimilar function compares each IRInstructionData against one +/// another and if we have the same sequences of IRInstructionData that would +/// create the same hash, we have similar IRSimilarityCandidates. +/// +/// We can also compare the structure of IRSimilarityCandidates. If we can +/// create a mapping of registers in the region contained by one +/// IRSimilarityCandidate to the region contained by different +/// IRSimilarityCandidate, they can be considered structurally similar. +/// +/// IRSimilarityCandidate1: IRSimilarityCandidate2: +/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e +/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f +/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 +/// +/// Can have the following mapping from candidate to candidate of: +/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 +/// and can be considered similar. +/// +/// IRSimilarityCandidate1: IRSimilarityCandidate2: +/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 +/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f +/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 +/// +/// We cannot create the same mapping since the use of c4 is not used in the +/// same way as %b or c2. +class IRSimilarityCandidate { +private: + /// The start index of this IRSimilarityCandidate in the instruction list. + unsigned StartIdx = 0; + + /// The number of instructions in this IRSimilarityCandidate. + unsigned Len = 0; + + /// The first instruction in this IRSimilarityCandidate. + IRInstructionData *FirstInst = nullptr; + + /// The last instruction in this IRSimilarityCandidate. 
+ IRInstructionData *LastInst = nullptr; + + /// Global Value Numbering structures + /// @{ + /// Stores the mapping of the value to the number assigned to it in the + /// IRSimilarityCandidate. + DenseMap ValueToNumber; + /// Stores the mapping of the number to the value assigned this number. + DenseMap NumberToValue; + /// @} + +public: + /// \param StartIdx - The starting location of the region. + /// \param Len - The length of the region. + /// \param FirstInstIt - The starting IRInstructionData of the region. + /// \param LastInstIt - The ending IRInstructionData of the region. + IRSimilarityCandidate(unsigned StartIdx, unsigned Len, + IRInstructionData *FirstInstIt, + IRInstructionData *LastInstIt); + + /// \param A - The first IRInstructionCandidate to compare. + /// \param B - The second IRInstructionCandidate to compare. + /// \returns True when every IRInstructionData in \p A is similar to every + /// IRInstructionData in \p B. + static bool isSimilar(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + /// \param A - The first IRInstructionCandidate to compare. + /// \param B - The second IRInstructionCandidate to compare. + /// \returns True when every IRInstructionData in \p A is structurally similar + /// to \p B. + static bool compareStructure(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + struct OperandMapping { + /// The IRSimilarityCandidate that holds the instruction the OperVals were + /// pulled from. + const IRSimilarityCandidate &IRSC; + + /// The operand values to be analyzed. + ArrayRef &OperVals; + + /// The current mapping of global value numbers from one IRSimilarityCandidate + /// to another IRSimilarityCandidate. + DenseMap> &ValueNumberMapping; + }; + + /// Compare the operands in \p A and \p B and check that the current mapping + /// of global value numbers from \p A to \p B and \p B to \A is consistent. 
+ /// + /// \param A - The first IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \param B - The second IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \returns true if the IRSimilarityCandidates operands are compatible. + static bool compareNonCommutativeOperandMapping(OperandMapping A, + OperandMapping B); + + /// Compare the operands in \p A and \p B and check that the current mapping + /// of global value numbers from \p A to \p B and \p B to \A is consistent + /// given that the operands are commutative. + /// + /// \param A - The first IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \param B - The second IRInstructionCandidate, operand values, and current + /// operand mappings to compare. + /// \returns true if the IRSimilarityCandidates operands are compatible. + static bool compareCommutativeOperandMapping(OperandMapping A, + OperandMapping B); + + /// Compare the start and end indices of the two IRSimilarityCandidates for + /// whether they overlap. If the start instruction of one + /// IRSimilarityCandidate is less than the end instruction of the other, and + /// the start instruction of one is greater than the start instruction of the + /// other, they overlap. + /// + /// \returns true if the IRSimilarityCandidates do not have overlapping + /// instructions. + static bool overlap(const IRSimilarityCandidate &A, + const IRSimilarityCandidate &B); + + /// \returns the number of instructions in this Candidate. + unsigned getLength() const { return Len; } + + /// \returns the start index of this IRSimilarityCandidate. + unsigned getStartIdx() const { return StartIdx; } + + /// \returns the end index of this IRSimilarityCandidate. + unsigned getEndIdx() const { return StartIdx + Len - 1; } + + /// \returns The first IRInstructionData. 
+ IRInstructionData *front() const { return FirstInst; } + /// \returns The last IRInstructionData. + IRInstructionData *back() const { return LastInst; } + + /// \returns The first Instruction. + Instruction *frontInstruction() { return FirstInst->Inst; } + /// \returns The last Instruction. + Instruction *backInstruction() { return LastInst->Inst; } + + /// \returns The BasicBlock the IRSimilarityCandidate starts in. + BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } + /// \returns The BasicBlock the IRSimilarityCandidate ends in. + BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } + + /// \returns The Function that the IRSimilarityCandidate is located in. + Function *getFunction() { return getStartBB()->getParent(); } + + /// Finds the positive number associated with \p V if it has been mapped. + /// \param [in] V - the Value to find. + /// \returns The positive number corresponding to the value. + /// \returns None if not present. + Optional getGVN(Value *V) { + assert(V != nullptr && "Value is a nullptr?"); + DenseMap::iterator VNIt = ValueToNumber.find(V); + if (VNIt == ValueToNumber.end()) + return None; + return VNIt->second; + } + + /// Finds the Value associated with \p Num if it exists. + /// \param [in] Num - the number to find. + /// \returns The Value associated with the number. + /// \returns None if not present. + Optional fromGVN(unsigned Num) { + DenseMap::iterator VNIt = NumberToValue.find(Num); + if (VNIt == NumberToValue.end()) + return None; + assert(VNIt->second != nullptr && "Found value is a nullptr!"); + return VNIt->second; + } + + /// \param RHS - The IRSimilarityCandidate to compare against + /// \returns true if the IRSimilarityCandidate occurs after the + /// IRSimilarityCandidate in the program.
+ bool operator<(const IRSimilarityCandidate &RHS) const { + return getStartIdx() > RHS.getStartIdx(); + } + + using iterator = IRInstructionDataList::iterator; + iterator begin() const { return iterator(front()); } + iterator end() const { return std::next(iterator(back())); } +}; + +typedef std::vector SimilarityGroup; +typedef std::vector SimilarityGroupList; + +/// This class puts all the pieces of the IRInstructionData, +/// IRInstructionMapper, IRSimilarityCandidate together. +/// +/// It first feeds the Module or vector of Modules into the IRInstructionMapper, +/// and puts all the mapped instructions into a single long list of +/// IRInstructionData. +/// +/// The list of unsigned integers is given to the Suffix Tree or similar data +/// structure to find repeated subsequences. We construct an +/// IRSimilarityCandidate for each instance of the subsequence. We compare them +/// against one another since These repeated subsequences can have different +/// structure. For each different kind of structure found, we create a +/// similarity group. +/// +/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are +/// structurally similar to one another, while C is different we would have two +/// SimilarityGroups: +/// +/// SimilarityGroup 1: SimilarityGroup 2 +/// A, B, D C +/// +/// A list of the different similarity groups is then returned after +/// analyzing the module. +class IRSimilarityIdentifier { +public: + IRSimilarityIdentifier() + : Mapper(&InstDataAllocator, &InstDataListAllocator) {} + + /// \param M the module to find similarity in. + explicit IRSimilarityIdentifier(Module &M) + : Mapper(&InstDataAllocator, &InstDataListAllocator) { + findSimilarity(M); + } + +private: + /// Map the instructions in the module to unsigned integers, using mapping + /// already present in the Mapper if possible. + /// + /// \param [in] M Module - To map to integers. + /// \param [in,out] InstrList - The vector to append IRInstructionData to. 
+ /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(Module &M, std::vector &InstrList, + std::vector &IntegerMapping); + + /// Map the instructions in the modules vector to unsigned integers, using + /// mapping already present in the mapper if possible. + /// + /// \param [in] Modules - The list of modules to use to populate the mapper + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(ArrayRef> &Modules, + std::vector &InstrList, + std::vector &IntegerMapping); + + /// Find the similarity candidates in \p InstrList and corresponding + /// \p UnsignedVec + /// + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + /// candidates found in the program. + void findCandidates(std::vector &InstrList, + std::vector &IntegerMapping); + +public: + // Find the IRSimilarityCandidates in the \p Modules and group by structural + // similarity in a SimilarityGroup, each group is returned in a + // SimilarityGroupList. + // + // \param [in] Modules - the modules to analyze. + // \returns The groups of similarity ranges found in the modules. + SimilarityGroupList & + findSimilarity(ArrayRef> Modules); + + // Find the IRSimilarityCandidates in the given Module grouped by structural + // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. + // + // \param [in] M - the module to analyze. + // \returns The groups of similarity ranges found in the module. + SimilarityGroupList &findSimilarity(Module &M); + + // Clears \ref SimilarityCandidates if it is already filled by a previous run. + void resetSimilarityCandidates() { + // If we've already analyzed a Module or set of Modules, so we must clear + // the SimilarityCandidates to make sure we do not have only old values + // hanging around. 
+ if (SimilarityCandidates.hasValue()) + SimilarityCandidates->clear(); + else + SimilarityCandidates = SimilarityGroupList(); + } + + // \returns The groups of similarity ranges found in the most recently passed + // set of modules. + Optional &getSimilarity() { + return SimilarityCandidates; + } + +private: + /// The allocator for IRInstructionData. + SpecificBumpPtrAllocator InstDataAllocator; + + /// The allocator for IRInstructionDataLists. + SpecificBumpPtrAllocator InstDataListAllocator; + + /// Map Instructions to unsigned integers and wraps the Instruction in an + /// instance of IRInstructionData. + IRInstructionMapper Mapper; + + /// The SimilarityGroups found with the most recent run of \ref + /// findSimilarity. None if there is no recent run. + Optional SimilarityCandidates; +}; + +} // end namespace IRSimilarity + +/// An analysis pass based on legacy pass manager that runs and returns +/// IRSimilarityIdentifier run on the Module. +class IRSimilarityIdentifierWrapperPass : public ModulePass { + std::unique_ptr IRSI; + +public: + static char ID; + IRSimilarityIdentifierWrapperPass(); + + IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } + const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the +/// Module. +class IRSimilarityAnalysis : public AnalysisInfoMixin { +public: + typedef IRSimilarity::IRSimilarityIdentifier Result; + + Result run(Module &M, ModuleAnalysisManager &); + +private: + friend AnalysisInfoMixin; + static AnalysisKey Key; +}; + +/// Printer pass that uses \c IRSimilarityAnalysis. 
+class IRSimilarityAnalysisPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/IVDescriptors.h b/contrib/llvm-project/llvm/include/llvm/Analysis/IVDescriptors.h index 1bae83d13c7a..28546110ba04 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/IVDescriptors.h @@ -32,7 +32,24 @@ class PredicatedScalarEvolution; class ScalarEvolution; class SCEV; class DominatorTree; -class ICFLoopSafetyInfo; + +/// These are the kinds of recurrences that we support. +enum class RecurKind { + None, ///< Not a recurrence. + Add, ///< Sum of integers. + Mul, ///< Product of integers. + Or, ///< Bitwise or logical OR of integers. + And, ///< Bitwise or logical AND of integers. + Xor, ///< Bitwise or logical XOR of integers. + SMin, ///< Signed integer min implemented in terms of select(cmp()). + SMax, ///< Signed integer max implemented in terms of select(cmp()). + UMin, ///< Unsigned integer min implemented in terms of select(cmp()). + UMax, ///< Unsigned integer max implemented in terms of select(cmp()). + FAdd, ///< Sum of floats. + FMul, ///< Product of floats. + FMin, ///< FP min implemented in terms of select(cmp()). + FMax ///< FP max implemented in terms of select(cmp()). +}; /// The RecurrenceDescriptor is used to identify recurrences variables in a /// loop. Reduction is a special case of recurrence that has uses of the @@ -48,40 +65,13 @@ class ICFLoopSafetyInfo; /// This struct holds information about recurrence variables. class RecurrenceDescriptor { public: - /// This enum represents the kinds of recurrences that we support. - enum RecurrenceKind { - RK_NoRecurrence, ///< Not a recurrence.
- RK_IntegerAdd, ///< Sum of integers. - RK_IntegerMult, ///< Product of integers. - RK_IntegerOr, ///< Bitwise or logical OR of numbers. - RK_IntegerAnd, ///< Bitwise or logical AND of numbers. - RK_IntegerXor, ///< Bitwise or logical XOR of numbers. - RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). - RK_FloatAdd, ///< Sum of floats. - RK_FloatMult, ///< Product of floats. - RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). - }; - - // This enum represents the kind of minmax recurrence. - enum MinMaxRecurrenceKind { - MRK_Invalid, - MRK_UIntMin, - MRK_UIntMax, - MRK_SIntMin, - MRK_SIntMax, - MRK_FloatMin, - MRK_FloatMax - }; - RecurrenceDescriptor() = default; - RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurrenceKind K, - FastMathFlags FMF, MinMaxRecurrenceKind MK, - Instruction *UAI, Type *RT, bool Signed, - SmallPtrSetImpl &CI) + RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K, + FastMathFlags FMF, Instruction *UAI, Type *RT, + bool Signed, SmallPtrSetImpl &CI) : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), - MinMaxKind(MK), UnsafeAlgebraInst(UAI), RecurrenceType(RT), - IsSigned(Signed) { + UnsafeAlgebraInst(UAI), RecurrenceType(RT), IsSigned(Signed) { CastInsts.insert(CI.begin(), CI.end()); } @@ -89,22 +79,22 @@ public: class InstDesc { public: InstDesc(bool IsRecur, Instruction *I, Instruction *UAI = nullptr) - : IsRecurrence(IsRecur), PatternLastInst(I), MinMaxKind(MRK_Invalid), - UnsafeAlgebraInst(UAI) {} + : IsRecurrence(IsRecur), PatternLastInst(I), + RecKind(RecurKind::None), UnsafeAlgebraInst(UAI) {} - InstDesc(Instruction *I, MinMaxRecurrenceKind K, Instruction *UAI = nullptr) - : IsRecurrence(true), PatternLastInst(I), MinMaxKind(K), + InstDesc(Instruction *I, RecurKind K, Instruction *UAI = nullptr) + : IsRecurrence(true), PatternLastInst(I), RecKind(K), UnsafeAlgebraInst(UAI) {} - bool isRecurrence() { return IsRecurrence; } + bool isRecurrence() const { return 
IsRecurrence; } - bool hasUnsafeAlgebra() { return UnsafeAlgebraInst != nullptr; } + bool hasUnsafeAlgebra() const { return UnsafeAlgebraInst != nullptr; } - Instruction *getUnsafeAlgebraInst() { return UnsafeAlgebraInst; } + Instruction *getUnsafeAlgebraInst() const { return UnsafeAlgebraInst; } - MinMaxRecurrenceKind getMinMaxKind() { return MinMaxKind; } + RecurKind getRecKind() const { return RecKind; } - Instruction *getPatternInst() { return PatternLastInst; } + Instruction *getPatternInst() const { return PatternLastInst; } private: // Is this instruction a recurrence candidate. @@ -112,8 +102,8 @@ public: // The last instruction in a min/max pattern (select of the select(icmp()) // pattern), or the current recurrence instruction otherwise. Instruction *PatternLastInst; - // If this is a min/max pattern the comparison predicate. - MinMaxRecurrenceKind MinMaxKind; + // If this is a min/max pattern. + RecurKind RecKind; // Recurrence has unsafe algebra. Instruction *UnsafeAlgebraInst; }; @@ -123,7 +113,7 @@ public: /// select(icmp()) this function advances the instruction pointer 'I' from the /// compare instruction to the select instruction and stores this pointer in /// 'PatternLastInst' member of the returned struct. - static InstDesc isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, + static InstDesc isRecurrenceInstr(Instruction *I, RecurKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr); /// Returns true if instruction I has multiple uses in Insts @@ -134,27 +124,28 @@ public: /// Returns true if all uses of the instruction I is within the Set. static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl &Set); - /// Returns a struct describing if the instruction if the instruction is a + /// Returns a struct describing if the instruction is a /// Select(ICmp(X, Y), X, Y) instruction pattern corresponding to a min(X, Y) - /// or max(X, Y). - static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev); + /// or max(X, Y). 
\p Prev specifies the description of an already processed + /// select instruction, so its corresponding cmp can be matched to it. + static InstDesc isMinMaxSelectCmpPattern(Instruction *I, + const InstDesc &Prev); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. - static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I); + static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I); /// Returns identity corresponding to the RecurrenceKind. - static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp); + static Constant *getRecurrenceIdentity(RecurKind K, Type *Tp); - /// Returns the opcode of binary operation corresponding to the - /// RecurrenceKind. - static unsigned getRecurrenceBinOp(RecurrenceKind Kind); + /// Returns the opcode corresponding to the RecurrenceKind. + static unsigned getOpcode(RecurKind Kind); /// Returns true if Phi is a reduction of type Kind and adds it to the /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be /// computed. 
- static bool AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, + static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes, DemandedBits *DB = nullptr, @@ -183,42 +174,63 @@ public: DenseMap &SinkAfter, DominatorTree *DT); - RecurrenceKind getRecurrenceKind() { return Kind; } + RecurKind getRecurrenceKind() const { return Kind; } - MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; } + unsigned getOpcode() const { return getOpcode(getRecurrenceKind()); } - FastMathFlags getFastMathFlags() { return FMF; } + FastMathFlags getFastMathFlags() const { return FMF; } - TrackingVH getRecurrenceStartValue() { return StartValue; } + TrackingVH getRecurrenceStartValue() const { return StartValue; } - Instruction *getLoopExitInstr() { return LoopExitInstr; } + Instruction *getLoopExitInstr() const { return LoopExitInstr; } /// Returns true if the recurrence has unsafe algebra which requires a relaxed /// floating-point model. - bool hasUnsafeAlgebra() { return UnsafeAlgebraInst != nullptr; } + bool hasUnsafeAlgebra() const { return UnsafeAlgebraInst != nullptr; } /// Returns first unsafe algebra instruction in the PHI node's use-chain. - Instruction *getUnsafeAlgebraInst() { return UnsafeAlgebraInst; } + Instruction *getUnsafeAlgebraInst() const { return UnsafeAlgebraInst; } /// Returns true if the recurrence kind is an integer kind. - static bool isIntegerRecurrenceKind(RecurrenceKind Kind); + static bool isIntegerRecurrenceKind(RecurKind Kind); /// Returns true if the recurrence kind is a floating point kind. - static bool isFloatingPointRecurrenceKind(RecurrenceKind Kind); + static bool isFloatingPointRecurrenceKind(RecurKind Kind); /// Returns true if the recurrence kind is an arithmetic kind. 
- static bool isArithmeticRecurrenceKind(RecurrenceKind Kind); + static bool isArithmeticRecurrenceKind(RecurKind Kind); + + /// Returns true if the recurrence kind is an integer min/max kind. + static bool isIntMinMaxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::UMin || Kind == RecurKind::UMax || + Kind == RecurKind::SMin || Kind == RecurKind::SMax; + } + + /// Returns true if the recurrence kind is a floating-point min/max kind. + static bool isFPMinMaxRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::FMin || Kind == RecurKind::FMax; + } + + /// Returns true if the recurrence kind is any min/max kind. + static bool isMinMaxRecurrenceKind(RecurKind Kind) { + return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind); + } /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. - Type *getRecurrenceType() { return RecurrenceType; } + Type *getRecurrenceType() const { return RecurrenceType; } /// Returns a reference to the instructions used for type-promoting the /// recurrence. - SmallPtrSet &getCastInsts() { return CastInsts; } + const SmallPtrSet &getCastInsts() const { return CastInsts; } /// Returns true if all source operands of the recurrence are SExtInsts. - bool isSigned() { return IsSigned; } + bool isSigned() const { return IsSigned; } + + /// Attempts to find a chain of operations from Phi to LoopExitInst that can + /// be treated as a set of reductions instructions for in-loop reductions. + SmallVector getReductionOpChain(PHINode *Phi, + Loop *L) const; private: // The starting value of the recurrence. @@ -227,12 +239,10 @@ private: // The instruction who's value is used outside the loop. Instruction *LoopExitInstr = nullptr; // The kind of the recurrence. - RecurrenceKind Kind = RK_NoRecurrence; + RecurKind Kind = RecurKind::None; // The fast-math flags on the recurrent instructions. 
We propagate these // fast-math flags into the vectorized FP instructions we generate. FastMathFlags FMF; - // If this a min/max recurrence the kind of recurrence. - MinMaxRecurrenceKind MinMaxKind = MRK_Invalid; // First occurrence of unasfe algebra in the PHI's use-chain. Instruction *UnsafeAlgebraInst = nullptr; // The type of the recurrence. @@ -258,12 +268,6 @@ public: /// Default constructor - creates an invalid induction. InductionDescriptor() = default; - /// Get the consecutive direction. Returns: - /// 0 - unknown or non-consecutive. - /// 1 - consecutive and increasing. - /// -1 - consecutive and decreasing. - int getConsecutiveDirection() const; - Value *getStartValue() const { return StartValue; } InductionKind getKind() const { return IK; } const SCEV *getStep() const { return Step; } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineAdvisor.h b/contrib/llvm-project/llvm/include/llvm/Analysis/InlineAdvisor.h index 3480d93385a8..c39fae13d3b8 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -9,12 +9,12 @@ #ifndef LLVM_INLINEADVISOR_H_ #define LLVM_INLINEADVISOR_H_ -#include -#include -#include - #include "llvm/Analysis/InlineCost.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/PassManager.h" +#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h" +#include +#include namespace llvm { class BasicBlock; @@ -36,7 +36,11 @@ class OptimizationRemarkEmitter; /// requires the full C Tensorflow API library, and evaluates models /// dynamically. This mode also permits generating training logs, for offline /// training. 
-enum class InliningAdvisorMode : int { Default, Release, Development }; +enum class InliningAdvisorMode : int { + Default, + Release, + Development +}; class InlineAdvisor; /// Capture state between an inlining decision having had been made, and @@ -62,10 +66,7 @@ public: /// behavior by implementing the corresponding record*Impl. /// /// Call after inlining succeeded, and did not result in deleting the callee. - void recordInlining() { - markRecorded(); - recordInliningImpl(); - } + void recordInlining(); /// Call after inlining succeeded, and resulted in deleting the callee. void recordInliningWithCalleeDeleted(); @@ -111,21 +112,44 @@ private: assert(!Recorded && "Recording should happen exactly once"); Recorded = true; } + void recordInlineStatsIfNeeded(); bool Recorded = false; }; +class DefaultInlineAdvice : public InlineAdvice { +public: + DefaultInlineAdvice(InlineAdvisor *Advisor, CallBase &CB, + Optional OIC, OptimizationRemarkEmitter &ORE, + bool EmitRemarks = true) + : InlineAdvice(Advisor, CB, ORE, OIC.hasValue()), OriginalCB(&CB), + OIC(OIC), EmitRemarks(EmitRemarks) {} + +private: + void recordUnsuccessfulInliningImpl(const InlineResult &Result) override; + void recordInliningWithCalleeDeletedImpl() override; + void recordInliningImpl() override; + +private: + CallBase *const OriginalCB; + Optional OIC; + bool EmitRemarks; +}; + /// Interface for deciding whether to inline a call site or not. class InlineAdvisor { public: InlineAdvisor(InlineAdvisor &&) = delete; - virtual ~InlineAdvisor() { freeDeletedFunctions(); } + virtual ~InlineAdvisor(); /// Get an InlineAdvice containing a recommendation on whether to /// inline or not. \p CB is assumed to be a direct call. \p FAM is assumed to - /// be up-to-date wrt previous inlining decisions. + /// be up-to-date wrt previous inlining decisions. \p MandatoryOnly indicates + /// only mandatory (always-inline) call sites should be recommended - this + /// allows the InlineAdvisor track such inlininings. 
/// Returns an InlineAdvice with the inlining recommendation. - virtual std::unique_ptr getAdvice(CallBase &CB) = 0; + std::unique_ptr getAdvice(CallBase &CB, + bool MandatoryOnly = false); /// This must be called when the Inliner pass is entered, to allow the /// InlineAdvisor update internal state, as result of function passes run @@ -138,9 +162,14 @@ public: virtual void onPassExit() {} protected: - InlineAdvisor(FunctionAnalysisManager &FAM) : FAM(FAM) {} + InlineAdvisor(Module &M, FunctionAnalysisManager &FAM); + virtual std::unique_ptr getAdviceImpl(CallBase &CB) = 0; + virtual std::unique_ptr getMandatoryAdvice(CallBase &CB, + bool Advice); + Module &M; FunctionAnalysisManager &FAM; + std::unique_ptr ImportedFunctionsStats; /// We may want to defer deleting functions to after the inlining for a whole /// module has finished. This allows us to reliably use function pointers as @@ -155,6 +184,14 @@ protected: return DeletedFunctions.count(F); } + enum class MandatoryInliningKind { NotMandatory, Always, Never }; + + static MandatoryInliningKind getMandatoryKind(CallBase &CB, + FunctionAnalysisManager &FAM, + OptimizationRemarkEmitter &ORE); + + OptimizationRemarkEmitter &getCallerORE(CallBase &CB); + private: friend class InlineAdvice; void markFunctionAsDeleted(Function *F); @@ -166,11 +203,12 @@ private: /// reusable as-is for inliner pass test scenarios, as well as for regular use. class DefaultInlineAdvisor : public InlineAdvisor { public: - DefaultInlineAdvisor(FunctionAnalysisManager &FAM, InlineParams Params) - : InlineAdvisor(FAM), Params(Params) {} + DefaultInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + InlineParams Params) + : InlineAdvisor(M, FAM), Params(Params) {} private: - std::unique_ptr getAdvice(CallBase &CB) override; + std::unique_ptr getAdviceImpl(CallBase &CB) override; void onPassExit() override { freeDeletedFunctions(); } @@ -190,7 +228,8 @@ public: // InlineAdvisor must be preserved across analysis invalidations. 
return false; } - bool tryCreate(InlineParams Params, InliningAdvisorMode Mode); + bool tryCreate(InlineParams Params, InliningAdvisorMode Mode, + StringRef ReplayFile); InlineAdvisor *getAdvisor() const { return Advisor.get(); } void clear() { Advisor.reset(); } @@ -208,6 +247,12 @@ std::unique_ptr getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM); #endif +#ifdef LLVM_HAVE_TF_API +std::unique_ptr +getDevelopmentModeAdvisor(Module &M, ModuleAnalysisManager &MAM, + std::function GetDefaultAdvice); +#endif + // Default (manual policy) decision making helper APIs. Shared with the legacy // pass manager inliner. @@ -226,6 +271,9 @@ void emitInlinedInto(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, bool ForProfileContext = false, const char *PassName = nullptr); +/// get call site location as string +std::string getCallSiteLocation(DebugLoc DLoc); + /// Add location info to ORE message. void addLocationToRemarks(OptimizationRemark &Remark, DebugLoc DLoc); diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineFeaturesAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/InlineFeaturesAnalysis.h deleted file mode 100644 index cc3f96c424e9..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineFeaturesAnalysis.h +++ /dev/null @@ -1,45 +0,0 @@ -//===- InlineFeaturesAnalysis.h - ML Policy Feature extraction -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_INLINEFEATURESANALYSIS_H_ -#define LLVM_INLINEFEATURESANALYSIS_H_ - -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Function; - -class InlineFeaturesAnalysis - : public AnalysisInfoMixin { -public: - static AnalysisKey Key; - struct Result { - /// Number of basic blocks - int64_t BasicBlockCount = 0; - - /// Number of blocks reached from a conditional instruction, or that are - /// 'cases' of a SwitchInstr. - // FIXME: We may want to replace this with a more meaningful metric, like - // number of conditionally executed blocks: - // 'if (a) s();' would be counted here as 2 blocks, just like - // 'if (a) s(); else s2(); s3();' would. - int64_t BlocksReachedFromConditionalInstruction = 0; - - /// Number of uses of this function, plus 1 if the function is callable - /// outside the module. - int64_t Uses = 0; - - /// Number of direct calls made from this function to other functions - /// defined in this module. 
- int64_t DirectCallsToDefinedFunctions = 0; - }; - Result run(const Function &F, FunctionAnalysisManager &FAM); -}; - -} // namespace llvm -#endif // LLVM_INLINEFEATURESANALYSIS_H_ \ No newline at end of file diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h index 29a6f5914674..ab2cf52494c0 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h @@ -31,5 +31,15 @@ public: private: std::unique_ptr Evaluator; }; + +class InlineSizeEstimatorAnalysisPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit InlineSizeEstimatorAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; } // namespace llvm -#endif // LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H \ No newline at end of file +#endif // LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/InstCount.h b/contrib/llvm-project/llvm/include/llvm/Analysis/InstCount.h new file mode 100644 index 000000000000..e5ce822caf6e --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/InstCount.h @@ -0,0 +1,28 @@ +//===- InstCount.h - Collects the count of all instructions -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass collects the count of all instructions and reports them +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_INSTCOUNT_H +#define LLVM_ANALYSIS_INSTCOUNT_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Function; + +struct InstCountPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &); +}; + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_INSTCOUNT_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/InstructionSimplify.h b/contrib/llvm-project/llvm/include/llvm/Analysis/InstructionSimplify.h index b5ae54fb98bc..17d6f30a35cb 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -26,6 +26,10 @@ // same call context of that function (and not split between caller and callee // contexts of a directly recursive call, for example). // +// Additionally, these routines can't simplify to the instructions that are not +// def-reachable, meaning we can't just scan the basic block for instructions +// to simplify to. +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H @@ -98,19 +102,39 @@ struct SimplifyQuery { // be safely used. const InstrInfoQuery IIQ; + /// Controls whether simplifications are allowed to constrain the range of + /// possible values for uses of undef. If it is false, simplifications are not + /// allowed to assume a particular value for a use of undef for example. 
+ bool CanUseUndef = true; + SimplifyQuery(const DataLayout &DL, const Instruction *CXTI = nullptr) : DL(DL), CxtI(CXTI) {} SimplifyQuery(const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, - const Instruction *CXTI = nullptr, bool UseInstrInfo = true) - : DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo) {} + const Instruction *CXTI = nullptr, bool UseInstrInfo = true, + bool CanUseUndef = true) + : DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo), + CanUseUndef(CanUseUndef) {} SimplifyQuery getWithInstruction(Instruction *I) const { SimplifyQuery Copy(*this); Copy.CxtI = I; return Copy; } + SimplifyQuery getWithoutUndef() const { + SimplifyQuery Copy(*this); + Copy.CanUseUndef = false; + return Copy; + } + + /// If CanUseUndef is true, returns whether \p V is undef. + /// Otherwise always return false. + bool isUndefValue(Value *V) const { + if (!CanUseUndef) + return false; + return isa(V); + } }; // NOTE: the explicit multiple argument versions of these functions are @@ -268,7 +292,8 @@ Value *SimplifyFreezeInst(Value *Op, const SimplifyQuery &Q); Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); -/// See if V simplifies when its operand Op is replaced with RepOp. +/// See if V simplifies when its operand Op is replaced with RepOp. If not, +/// return null. /// AllowRefinement specifies whether the simplification can be a refinement, /// or whether it needs to be strictly identical. Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, @@ -288,17 +313,6 @@ bool replaceAndRecursivelySimplify( const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, SmallSetVector *UnsimplifiedUsers = nullptr); -/// Recursively attempt to simplify an instruction. -/// -/// This routine uses SimplifyInstruction to simplify 'I', and if successful -/// replaces uses of 'I' with the simplified value. 
It then recurses on each -/// of the users impacted. It returns true if any simplifications were -/// performed. -bool recursivelySimplifyInstruction(Instruction *I, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); - // These helper functions return a SimplifyQuery structure that contains as // many of the optional analysis we use as are currently valid. This is the // strongly preferred way of constructing SimplifyQuery in passes. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Interval.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Interval.h index 5c9a4535bc7f..9afe659d00dd 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/Interval.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Interval.h @@ -89,9 +89,6 @@ public: return HeaderNode == I.HeaderNode; } - /// isLoop - Find out if there is a back edge in this interval... - bool isLoop() const; - /// print - Show contents in human readable format... void print(raw_ostream &O) const; }; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/IntervalIterator.h b/contrib/llvm-project/llvm/include/llvm/Analysis/IntervalIterator.h index efaaf9715b3d..8e2273618a66 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/IntervalIterator.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/IntervalIterator.h @@ -81,7 +81,7 @@ inline void addNodeToInterval(Interval *Int, BasicBlock *BB) { // BasicBlocks are added to the interval. inline void addNodeToInterval(Interval *Int, Interval *I) { // Add all of the nodes in I as new nodes in Int. - Int->Nodes.insert(Int->Nodes.end(), I->Nodes.begin(), I->Nodes.end()); + llvm::append_range(Int->Nodes, I->Nodes); } template, @@ -227,9 +227,7 @@ private: if (Int->isSuccessor(NodeHeader)) { // If we were in the successor list from before... 
remove from succ list - Int->Successors.erase(std::remove(Int->Successors.begin(), - Int->Successors.end(), NodeHeader), - Int->Successors.end()); + llvm::erase_value(Int->Successors, NodeHeader); } // Now that we have discovered that Node is in the interval, perhaps some diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h b/contrib/llvm-project/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h index fb6605285156..8166b52aa226 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -73,13 +73,7 @@ ChildrenGetterTy::get(const NodeRef &N) { return {Children.begin(), Children.end()}; } - using SnapShotBBPairTy = - std::pair *, OrderedNodeTy>; - - ChildrenTy Ret; - for (const auto &SnapShotBBPair : children({GD, N})) - Ret.emplace_back(SnapShotBBPair.second); - return Ret; + return GD->template getChildren(N); } } // end of namespace IDFCalculatorDetail diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h index f4249f74104c..3c632f02905a 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyBranchProbabilityInfo.h @@ -63,7 +63,7 @@ class LazyBranchProbabilityInfoPass : public FunctionPass { BranchProbabilityInfo &getCalculated() { if (!Calculated) { assert(F && LI && "call setAnalysis"); - BPI.calculate(*F, *LI, TLI, nullptr); + BPI.calculate(*F, *LI, TLI, nullptr, nullptr); Calculated = true; } return BPI; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyCallGraph.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyCallGraph.h index ea63b837ba70..f7a5adac2b43 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyCallGraph.h +++ 
b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyCallGraph.h @@ -258,7 +258,6 @@ public: iterator begin() { return iterator(Edges.begin(), Edges.end()); } iterator end() { return iterator(Edges.end(), Edges.end()); } - Edge &operator[](int i) { return Edges[i]; } Edge &operator[](Node &N) { assert(EdgeIndexMap.find(&N) != EdgeIndexMap.end() && "No such edge!"); auto &E = Edges[EdgeIndexMap.find(&N)->second]; @@ -305,13 +304,6 @@ public: /// Internal helper to remove the edge to the given function. bool removeEdgeInternal(Node &ChildN); - - /// Internal helper to replace an edge key with a new one. - /// - /// This should be used when the function for a particular node in the - /// graph gets replaced and we are updating all of the edges to that node - /// to use the new function as the key. - void replaceEdgeKey(Function &OldTarget, Function &NewTarget); }; /// A node in the call graph. @@ -606,10 +598,6 @@ public: void verify(); #endif - /// Handle any necessary parent set updates after inserting a trivial ref - /// or call edge. - void handleTrivialEdgeInsertion(Node &SourceN, Node &TargetN); - public: using iterator = pointee_iterator::const_iterator>; using range = iterator_range; @@ -1058,12 +1046,29 @@ public: /// fully visited by the DFS prior to calling this routine. void removeDeadFunction(Function &F); - /// Introduce a node for the function \p NewF in the SCC \p C. - void addNewFunctionIntoSCC(Function &NewF, SCC &C); + /// Add a new function split/outlined from an existing function. + /// + /// The new function may only reference other functions that the original + /// function did. + /// + /// The original function must reference (either directly or indirectly) the + /// new function. + /// + /// The new function may also reference the original function. + /// It may end up in a parent SCC in the case that the original function's + /// edge to the new function is a ref edge, and the edge back is a call edge. 
+ void addSplitFunction(Function &OriginalFunction, Function &NewFunction); - /// Introduce a node for the function \p NewF, as a single node in a - /// new SCC, in the RefSCC \p RC. - void addNewFunctionIntoRefSCC(Function &NewF, RefSCC &RC); + /// Add new ref-recursive functions split/outlined from an existing function. + /// + /// The new functions may only reference other functions that the original + /// function did. The new functions may reference (not call) the original + /// function. + /// + /// The original function must reference (not call) all new functions. + /// All new functions must reference (not call) each other. + void addSplitRefRecursiveFunctions(Function &OriginalFunction, + ArrayRef NewFunctions); ///@} @@ -1168,16 +1173,14 @@ private: /// the NodeMap. Node &insertInto(Function &F, Node *&MappedN); + /// Helper to initialize a new node created outside of creating SCCs and add + /// it to the NodeMap if necessary. For example, useful when a function is + /// split. + Node &initNode(Function &F); + /// Helper to update pointers back to the graph object during moves. void updateGraphPtrs(); - /// Helper to insert a new function, add it to the NodeMap, and populate its - /// node. - Node &createNode(Function &F); - - /// Helper to add the given Node \p N to the SCCMap, mapped to the SCC \p C. - void addNodeToSCC(SCC &C, Node &N); - /// Allocates an SCC and constructs it using the graph allocator. /// /// The arguments are forwarded to the constructor. 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyValueInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyValueInfo.h index 1bc88235273e..363cb49af382 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -71,20 +71,20 @@ public: Instruction *CxtI = nullptr); /// Determine whether the specified value comparison with a constant is known - /// to be true or false at the specified instruction - /// (from an assume intrinsic). Pred is a CmpInst predicate. + /// to be true or false at the specified instruction. + /// \p Pred is a CmpInst predicate. If \p UseBlockValue is true, the block + /// value is also taken into account. Tristate getPredicateAt(unsigned Pred, Value *V, Constant *C, - Instruction *CxtI); + Instruction *CxtI, bool UseBlockValue = false); - /// Determine whether the specified value is known to be a - /// constant at the end of the specified block. Return null if not. - Constant *getConstant(Value *V, BasicBlock *BB, Instruction *CxtI = nullptr); + /// Determine whether the specified value is known to be a constant at the + /// specified instruction. Return null if not. + Constant *getConstant(Value *V, Instruction *CxtI); /// Return the ConstantRange constraint that is known to hold for the - /// specified value at the end of the specified block. This may only be called + /// specified value at the specified instruction. This may only be called /// on integer-typed Values. 
- ConstantRange getConstantRange(Value *V, BasicBlock *BB, - Instruction *CxtI = nullptr, + ConstantRange getConstantRange(Value *V, Instruction *CxtI, bool UndefAllowed = true); /// Determine whether the specified value is known to be a diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Lint.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Lint.h index 0fea81e215c9..6eb637e72782 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/Lint.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Lint.h @@ -19,30 +19,30 @@ #ifndef LLVM_ANALYSIS_LINT_H #define LLVM_ANALYSIS_LINT_H +#include "llvm/IR/PassManager.h" + namespace llvm { class FunctionPass; class Module; class Function; -/// Create a lint pass. -/// -/// Check a module or function. -FunctionPass *createLintPass(); +FunctionPass *createLintLegacyPassPass(); -/// Check a module. +/// Lint a module. /// /// This should only be used for debugging, because it plays games with /// PassManagers and stuff. -void lintModule( - const Module &M ///< The module to be checked -); +void lintModule(const Module &M); + +// Lint a function. +void lintFunction(const Function &F); -// lintFunction - Check a function. 
-void lintFunction( - const Function &F ///< The function to be checked -); +class LintPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; -} // End llvm namespace +} // namespace llvm -#endif +#endif // LLVM_ANALYSIS_LINT_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h index 5665a802942d..24a05610e68d 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h @@ -155,6 +155,15 @@ Value *FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy, bool AtLeastAtomic, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst); + +/// Returns true if a pointer value \p A can be replace with another pointer +/// value \B if they are deemed equal through some means (e.g. information from +/// conditions). +/// NOTE: the current implementations is incomplete and unsound. It does not +/// reject all invalid cases yet, but will be made stricter in the future. In +/// particular this means returning true means unknown if replacement is safe. 
+bool canReplacePointersIfEqual(Value *A, Value *B, const DataLayout &DL, + Instruction *CtxI); } #endif diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index a5237e9ba59e..13fbe884eddf 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -171,7 +171,8 @@ public: MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L) : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeDepDistBytes(0), - MaxSafeRegisterWidth(-1U), FoundNonConstantDistanceDependence(false), + MaxSafeVectorWidthInBits(-1U), + FoundNonConstantDistanceDependence(false), Status(VectorizationSafetyStatus::Safe), RecordDependences(true) {} /// Register the location (instructions are given increasing numbers) @@ -204,13 +205,21 @@ public: return Status == VectorizationSafetyStatus::Safe; } + /// Return true if the number of elements that are safe to operate on + /// simultaneously is not bounded. + bool isSafeForAnyVectorWidth() const { + return MaxSafeVectorWidthInBits == UINT_MAX; + } + /// The maximum number of bytes of a vector register we can vectorize /// the accesses safely with. uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } /// Return the number of elements that are safe to operate on /// simultaneously, multiplied by the size of the element in bits. - uint64_t getMaxSafeRegisterWidth() const { return MaxSafeRegisterWidth; } + uint64_t getMaxSafeVectorWidthInBits() const { + return MaxSafeVectorWidthInBits; + } /// In same cases when the dependency check fails we can still /// vectorize the loop with a dynamic array access check. @@ -275,7 +284,7 @@ private: /// operate on simultaneously, multiplied by the size of the element in bits. /// The size of the element is taken from the memory access that is most /// restrictive. 
- uint64_t MaxSafeRegisterWidth; + uint64_t MaxSafeVectorWidthInBits; /// If we see a non-constant dependence distance we can still try to /// vectorize this loop with runtime checks. @@ -418,7 +427,7 @@ public: bool UseDependencies); /// Returns the checks that generateChecks created. - const SmallVector &getChecks() const { + const SmallVectorImpl &getChecks() const { return Checks; } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAnalysisManager.h index 0e162e03bde1..11dbd15c8678 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAnalysisManager.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopAnalysisManager.h @@ -57,6 +57,7 @@ struct LoopStandardAnalysisResults { ScalarEvolution &SE; TargetLibraryInfo &TLI; TargetTransformInfo &TTI; + BlockFrequencyInfo *BFI; MemorySSA *MSSA; }; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopCacheAnalysis.h index ffec78b6db2c..e8f2205545eb 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopCacheAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -14,19 +14,20 @@ #ifndef LLVM_ANALYSIS_LOOPCACHEANALYSIS_H #define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/raw_ostream.h" namespace llvm { +class AAResults; +class DependenceInfo; class LPMUpdater; +class ScalarEvolution; +class SCEV; +class TargetTransformInfo; + using CacheCostTy = int64_t; using LoopVectorTy = SmallVector; @@ -70,7 +71,7 @@ public: /// the same chace 
line iff the distance between them in the innermost /// dimension is less than the cache line size. Return None if unsure. Optional hasSpacialReuse(const IndexedReference &Other, unsigned CLS, - AliasAnalysis &AA) const; + AAResults &AA) const; /// Return true if the current object and the indexed reference \p Other /// have distance smaller than \p MaxDistance in the dimension associated with @@ -78,7 +79,7 @@ public: /// MaxDistance and None if unsure. Optional hasTemporalReuse(const IndexedReference &Other, unsigned MaxDistance, const Loop &L, - DependenceInfo &DI, AliasAnalysis &AA) const; + DependenceInfo &DI, AAResults &AA) const; /// Compute the cost of the reference w.r.t. the given loop \p L when it is /// considered in the innermost position in the loop nest. @@ -118,7 +119,7 @@ private: /// Return true if the given reference \p Other is definetely aliased with /// the indexed reference represented by this class. - bool isAliased(const IndexedReference &Other, AliasAnalysis &AA) const; + bool isAliased(const IndexedReference &Other, AAResults &AA) const; private: /// True if the reference can be delinearized, false otherwise. @@ -183,7 +184,7 @@ public: /// between array elements accessed in a loop so that the elements are /// classified to have temporal reuse. CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, - TargetTransformInfo &TTI, AliasAnalysis &AA, DependenceInfo &DI, + TargetTransformInfo &TTI, AAResults &AA, DependenceInfo &DI, Optional TRT = None); /// Create a CacheCost for the loop nest rooted by \p Root. @@ -197,9 +198,9 @@ public: /// Return the estimated cost of loop \p L if the given loop is part of the /// loop nest associated with this object. Return -1 otherwise. 
CacheCostTy getLoopCost(const Loop &L) const { - auto IT = std::find_if( - LoopCosts.begin(), LoopCosts.end(), - [&L](const LoopCacheCostTy &LCC) { return LCC.first == &L; }); + auto IT = llvm::find_if(LoopCosts, [&L](const LoopCacheCostTy &LCC) { + return LCC.first == &L; + }); return (IT != LoopCosts.end()) ? (*IT).second : -1; } @@ -258,7 +259,7 @@ private: const LoopInfo &LI; ScalarEvolution &SE; TargetTransformInfo &TTI; - AliasAnalysis &AA; + AAResults &AA; DependenceInfo &DI; }; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfo.h index 35fe2a03a2a2..a5717bae12c3 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfo.h @@ -155,7 +155,17 @@ public: iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } reverse_iterator rend() const { return getSubLoops().rend(); } - bool empty() const { return getSubLoops().empty(); } + + // LoopInfo does not detect irreducible control flow, just natural + // loops. That is, it is possible that there is cyclic control + // flow within the "innermost loop" or around the "outermost + // loop". + + /// Return true if the loop does not contain any (natural) loops. + bool isInnermost() const { return getSubLoops().empty(); } + /// Return true if the loop does not have a parent (natural) loop + // (i.e. it is outermost, which is the same as top-level). + bool isOutermost() const { return getParentLoop() == nullptr; } /// Get a list of the basic blocks which make up this loop. ArrayRef getBlocks() const { @@ -292,6 +302,9 @@ public: /// Otherwise return null. BlockT *getUniqueExitBlock() const; + /// Return true if this loop does not have any exit blocks. + bool hasNoExitBlocks() const; + /// Edge type. 
typedef std::pair Edge; @@ -830,6 +843,9 @@ public: /// unrolling pass is run more than once (which it generally is). void setLoopAlreadyUnrolled(); + /// Add llvm.loop.mustprogress to this loop's loop id metadata. + void setLoopMustProgress(); + void dump() const; void dumpVerbose() const; @@ -974,7 +990,7 @@ public: LoopT *removeLoop(iterator I) { assert(I != end() && "Cannot remove end iterator!"); LoopT *L = *I; - assert(!L->getParentLoop() && "Not a top-level loop!"); + assert(L->isOutermost() && "Not a top-level loop!"); TopLevelLoops.erase(TopLevelLoops.begin() + (I - begin())); return L; } @@ -1002,7 +1018,7 @@ public: /// This adds the specified loop to the collection of top-level loops. void addTopLevelLoop(LoopT *New) { - assert(!New->getParentLoop() && "Loop already in subloop!"); + assert(New->isOutermost() && "Loop already in subloop!"); TopLevelLoops.push_back(New); } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfoImpl.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfoImpl.h index 58a4abafcc85..426b349c6b8a 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfoImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopInfoImpl.h @@ -68,6 +68,13 @@ void LoopBase::getExitBlocks( ExitBlocks.push_back(Succ); } +template +bool LoopBase::hasNoExitBlocks() const { + SmallVector ExitBlocks; + getExitBlocks(ExitBlocks); + return ExitBlocks.empty(); +} + /// getExitBlock - If getExitBlocks would return exactly one block, /// return that block. Otherwise return null. template @@ -502,7 +509,7 @@ void PopulateLoopsDFS::insertIntoLoop(BlockT *Block) { if (Subloop && Block == Subloop->getHeader()) { // We reach this point once per subloop after processing all the blocks in // the subloop. 
- if (Subloop->getParentLoop()) + if (!Subloop->isOutermost()) Subloop->getParentLoop()->getSubLoopsVector().push_back(Subloop); else LI->addTopLevelLoop(Subloop); @@ -666,12 +673,13 @@ static void compareLoops(const LoopT *L, const LoopT *OtherL, "Mismatched basic blocks in the loops!"); const SmallPtrSetImpl &BlocksSet = L->getBlocksSet(); - const SmallPtrSetImpl &OtherBlocksSet = L->getBlocksSet(); + const SmallPtrSetImpl &OtherBlocksSet = + OtherL->getBlocksSet(); assert(BlocksSet.size() == OtherBlocksSet.size() && - std::all_of(BlocksSet.begin(), BlocksSet.end(), - [&OtherBlocksSet](const BlockT *BB) { - return OtherBlocksSet.count(BB); - }) && + llvm::all_of(BlocksSet, + [&OtherBlocksSet](const BlockT *BB) { + return OtherBlocksSet.count(BB); + }) && "Mismatched basic blocks in BlocksSets!"); } #endif @@ -681,7 +689,7 @@ void LoopInfoBase::verify( const DomTreeBase &DomTree) const { DenseSet Loops; for (iterator I = begin(), E = end(); I != E; ++I) { - assert(!(*I)->getParentLoop() && "Top-level loop has a parent!"); + assert((*I)->isOutermost() && "Top-level loop has a parent!"); (*I)->verifyLoopNest(&Loops); } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopNestAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopNestAnalysis.h index 792958a312ce..9c4fb4dbc29b 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/LoopNestAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/LoopNestAnalysis.h @@ -14,6 +14,7 @@ #ifndef LLVM_ANALYSIS_LOOPNESTANALYSIS_H #define LLVM_ANALYSIS_LOOPNESTANALYSIS_H +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -59,6 +60,12 @@ public: /// getMaxPerfectDepth(Loop_i) would return 2. static unsigned getMaxPerfectDepth(const Loop &Root, ScalarEvolution &SE); + /// Recursivelly traverse all empty 'single successor' basic blocks of \p From + /// (if there are any). 
Return the last basic block found or \p End if it was + /// reached during the search. + static const BasicBlock &skipEmptyBlockUntil(const BasicBlock *From, + const BasicBlock *End); + /// Return the outermost loop in the loop nest. Loop &getOutermostLoop() const { return *Loops.front(); } @@ -124,10 +131,16 @@ public: /// Return true if all loops in the loop nest are in simplify form. bool areAllLoopsSimplifyForm() const { - return llvm::all_of(Loops, - [](const Loop *L) { return L->isLoopSimplifyForm(); }); + return all_of(Loops, [](const Loop *L) { return L->isLoopSimplifyForm(); }); + } + + /// Return true if all loops in the loop nest are in rotated form. + bool areAllLoopsRotatedForm() const { + return all_of(Loops, [](const Loop *L) { return L->isRotatedForm(); }); } + StringRef getName() const { return Loops.front()->getName(); } + protected: const unsigned MaxPerfectDepth; // maximum perfect nesting depth level. LoopVectorTy Loops; // the loops in the nest (in breadth first order). 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MLInlineAdvisor.h index cbe3b1f1f4e6..54edbb823263 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MLInlineAdvisor.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MLInlineAdvisor.h @@ -31,8 +31,6 @@ public: void onPassEntry() override; - std::unique_ptr getAdvice(CallBase &CB) override; - int64_t getIRSize(const Function &F) const { return F.getInstructionCount(); } void onSuccessfulInlining(const MLInlineAdvice &Advice, bool CalleeWasDeleted); @@ -42,13 +40,16 @@ public: const MLModelRunner &getModelRunner() const { return *ModelRunner.get(); } protected: - virtual std::unique_ptr - getMandatoryAdvice(CallBase &CB, OptimizationRemarkEmitter &ORE); + std::unique_ptr getAdviceImpl(CallBase &CB) override; + + std::unique_ptr getMandatoryAdvice(CallBase &CB, + bool Advice) override; + + virtual std::unique_ptr getMandatoryAdviceImpl(CallBase &CB); virtual std::unique_ptr getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE); - Module &M; std::unique_ptr ModelRunner; private: @@ -104,4 +105,4 @@ private: } // namespace llvm -#endif // LLVM_ANALYSIS_MLINLINEADVISOR_H \ No newline at end of file +#endif // LLVM_ANALYSIS_MLINLINEADVISOR_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MemDerefPrinter.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MemDerefPrinter.h new file mode 100644 index 000000000000..bafdc543eeaf --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MemDerefPrinter.h @@ -0,0 +1,24 @@ +//===- MemDerefPrinter.h - Printer for isDereferenceablePointer -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_MEMDEREFPRINTER_H +#define LLVM_ANALYSIS_MEMDEREFPRINTER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class MemDerefPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + MemDerefPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // namespace llvm + +#endif // LLVM_ANALYSIS_MEMDEREFPRINTER_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h index 0777dc7d7862..efde00f82d57 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -302,7 +302,7 @@ private: /// The maximum size of the dereferences of the pointer. /// /// May be UnknownSize if the sizes are unknown. - LocationSize Size = LocationSize::unknown(); + LocationSize Size = LocationSize::afterPointer(); /// The AA tags associated with dereferences of the pointer. /// /// The members may be null if there are no tags or conflicting tags. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryLocation.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryLocation.h index d01ac7da85cd..3b188d763ef2 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryLocation.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MemoryLocation.h @@ -64,10 +64,11 @@ class VAArgInst; // None. 
class LocationSize { enum : uint64_t { - Unknown = ~uint64_t(0), + BeforeOrAfterPointer = ~uint64_t(0), + AfterPointer = BeforeOrAfterPointer - 1, + MapEmpty = BeforeOrAfterPointer - 2, + MapTombstone = BeforeOrAfterPointer - 3, ImpreciseBit = uint64_t(1) << 63, - MapEmpty = Unknown - 1, - MapTombstone = Unknown - 2, // The maximum value we can represent without falling back to 'unknown'. MaxValue = (MapTombstone - 1) & ~ImpreciseBit, @@ -81,7 +82,11 @@ class LocationSize { constexpr LocationSize(uint64_t Raw, DirectConstruction): Value(Raw) {} - static_assert(Unknown & ImpreciseBit, "Unknown is imprecise by definition."); + static_assert(AfterPointer & ImpreciseBit, + "AfterPointer is imprecise by definition."); + static_assert(BeforeOrAfterPointer & ImpreciseBit, + "BeforeOrAfterPointer is imprecise by definition."); + public: // FIXME: Migrate all users to construct via either `precise` or `upperBound`, // to make it more obvious at the callsite the kind of size that they're @@ -90,12 +95,12 @@ public: // Since the overwhelming majority of users of this provide precise values, // this assumes the provided value is precise. constexpr LocationSize(uint64_t Raw) - : Value(Raw > MaxValue ? Unknown : Raw) {} + : Value(Raw > MaxValue ? 
AfterPointer : Raw) {} static LocationSize precise(uint64_t Value) { return LocationSize(Value); } static LocationSize precise(TypeSize Value) { if (Value.isScalable()) - return unknown(); + return afterPointer(); return precise(Value.getFixedSize()); } @@ -104,17 +109,25 @@ public: if (LLVM_UNLIKELY(Value == 0)) return precise(0); if (LLVM_UNLIKELY(Value > MaxValue)) - return unknown(); + return afterPointer(); return LocationSize(Value | ImpreciseBit, Direct); } static LocationSize upperBound(TypeSize Value) { if (Value.isScalable()) - return unknown(); + return afterPointer(); return upperBound(Value.getFixedSize()); } - constexpr static LocationSize unknown() { - return LocationSize(Unknown, Direct); + /// Any location after the base pointer (but still within the underlying + /// object). + constexpr static LocationSize afterPointer() { + return LocationSize(AfterPointer, Direct); + } + + /// Any location before or after the base pointer (but still within the + /// underlying object). + constexpr static LocationSize beforeOrAfterPointer() { + return LocationSize(BeforeOrAfterPointer, Direct); } // Sentinel values, generally used for maps. @@ -131,20 +144,24 @@ public: if (Other == *this) return *this; - if (!hasValue() || !Other.hasValue()) - return unknown(); + if (Value == BeforeOrAfterPointer || Other.Value == BeforeOrAfterPointer) + return beforeOrAfterPointer(); + if (Value == AfterPointer || Other.Value == AfterPointer) + return afterPointer(); return upperBound(std::max(getValue(), Other.getValue())); } - bool hasValue() const { return Value != Unknown; } + bool hasValue() const { + return Value != AfterPointer && Value != BeforeOrAfterPointer; + } uint64_t getValue() const { assert(hasValue() && "Getting value from an unknown LocationSize!"); return Value & ~ImpreciseBit; } // Returns whether or not this value is precise. Note that if a value is - // precise, it's guaranteed to not be `unknown()`. + // precise, it's guaranteed to not be unknown. 
bool isPrecise() const { return (Value & ImpreciseBit) == 0; } @@ -152,6 +169,9 @@ public: // Convenience method to check if this LocationSize's value is 0. bool isZero() const { return hasValue() && getValue() == 0; } + /// Whether accesses before the base pointer are possible. + bool mayBeBeforePointer() const { return Value == BeforeOrAfterPointer; } + bool operator==(const LocationSize &Other) const { return Value == Other.Value; } @@ -242,14 +262,30 @@ public: return getForArgument(Call, ArgIdx, &TLI); } + /// Return a location that may access any location after Ptr, while remaining + /// within the underlying object. + static MemoryLocation getAfter(const Value *Ptr, + const AAMDNodes &AATags = AAMDNodes()) { + return MemoryLocation(Ptr, LocationSize::afterPointer(), AATags); + } + + /// Return a location that may access any location before or after Ptr, while + /// remaining within the underlying object. + static MemoryLocation + getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags = AAMDNodes()) { + return MemoryLocation(Ptr, LocationSize::beforeOrAfterPointer(), AATags); + } + // Return the exact size if the exact size is known at compiletime, // otherwise return MemoryLocation::UnknownSize. static uint64_t getSizeOrUnknown(const TypeSize &T) { return T.isScalable() ? 
UnknownSize : T.getFixedSize(); } - explicit MemoryLocation(const Value *Ptr = nullptr, - LocationSize Size = LocationSize::unknown(), + MemoryLocation() + : Ptr(nullptr), Size(LocationSize::beforeOrAfterPointer()), AATags() {} + + explicit MemoryLocation(const Value *Ptr, LocationSize Size, const AAMDNodes &AATags = AAMDNodes()) : Ptr(Ptr), Size(Size), AATags(AATags) {} diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSA.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSA.h index 5ce2b3fd047f..63c031b1921f 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSA.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSA.h @@ -88,6 +88,7 @@ #include "llvm/IR/DerivedUser.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -108,6 +109,7 @@ namespace llvm { /// Enables memory ssa as a dependency for loop passes. extern cl::opt EnableMSSALoopDependency; +class AllocaInst; class Function; class Instruction; class MemoryAccess; @@ -270,7 +272,7 @@ public: // Retrieve AliasResult type of the optimized access. Ideally this would be // returned by the caching walker and may go away in the future. Optional getOptimizedAccessType() const { - return OptimizedAccessAlias; + return isOptimized() ? 
OptimizedAccessAlias : None; } /// Reset the ID of what this MemoryUse was optimized to, causing it to @@ -840,7 +842,6 @@ private: CachingWalker *getWalkerImpl(); void buildMemorySSA(BatchAAResults &BAA); - void optimizeUses(); void prepareForMoveTo(MemoryAccess *, BasicBlock *); void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const; @@ -848,15 +849,11 @@ private: using AccessMap = DenseMap>; using DefsMap = DenseMap>; - void - determineInsertionPoint(const SmallPtrSetImpl &DefiningBlocks); void markUnreachableAsLiveOnEntry(BasicBlock *BB); - bool dominatesUse(const MemoryAccess *, const MemoryAccess *) const; MemoryPhi *createMemoryPhi(BasicBlock *BB); template MemoryUseOrDef *createNewAccess(Instruction *, AliasAnalysisType *, const MemoryUseOrDef *Template = nullptr); - MemoryAccess *findDominatingDef(BasicBlock *, enum InsertionPlace); void placePHINodes(const SmallPtrSetImpl &); MemoryAccess *renameBlock(BasicBlock *, MemoryAccess *, bool); void renameSuccessorPhis(BasicBlock *, MemoryAccess *, bool); @@ -1181,9 +1178,11 @@ class upward_defs_iterator using BaseT = upward_defs_iterator::iterator_facade_base; public: - upward_defs_iterator(const MemoryAccessPair &Info, DominatorTree *DT) + upward_defs_iterator(const MemoryAccessPair &Info, DominatorTree *DT, + bool *PerformedPhiTranslation = nullptr) : DefIterator(Info.first), Location(Info.second), - OriginalAccess(Info.first), DT(DT) { + OriginalAccess(Info.first), DT(DT), + PerformedPhiTranslation(PerformedPhiTranslation) { CurrentPair.first = nullptr; WalkingPhi = Info.first && isa(Info.first); @@ -1215,38 +1214,60 @@ public: BasicBlock *getPhiArgBlock() const { return DefIterator.getPhiArgBlock(); } private: + /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible + /// loop. In particular, this guarantees that it only references a single + /// MemoryLocation during execution of the containing function. 
+ bool IsGuaranteedLoopInvariant(Value *Ptr) const; + void fillInCurrentPair() { CurrentPair.first = *DefIterator; + CurrentPair.second = Location; if (WalkingPhi && Location.Ptr) { + // Mark size as unknown, if the location is not guaranteed to be + // loop-invariant for any possible loop in the function. Setting the size + // to unknown guarantees that any memory accesses that access locations + // after the pointer are considered as clobbers, which is important to + // catch loop carried dependences. + if (Location.Ptr && + !IsGuaranteedLoopInvariant(const_cast(Location.Ptr))) + CurrentPair.second = + Location.getWithNewSize(LocationSize::beforeOrAfterPointer()); PHITransAddr Translator( const_cast(Location.Ptr), OriginalAccess->getBlock()->getModule()->getDataLayout(), nullptr); + if (!Translator.PHITranslateValue(OriginalAccess->getBlock(), DefIterator.getPhiArgBlock(), DT, - false)) { - if (Translator.getAddr() != Location.Ptr) { - CurrentPair.second = Location.getWithNewPtr(Translator.getAddr()); - return; + true)) { + Value *TransAddr = Translator.getAddr(); + if (TransAddr != Location.Ptr) { + CurrentPair.second = CurrentPair.second.getWithNewPtr(TransAddr); + + if (TransAddr && + !IsGuaranteedLoopInvariant(const_cast(TransAddr))) + CurrentPair.second = CurrentPair.second.getWithNewSize( + LocationSize::beforeOrAfterPointer()); + + if (PerformedPhiTranslation) + *PerformedPhiTranslation = true; } - } else { - CurrentPair.second = Location.getWithNewSize(LocationSize::unknown()); - return; } } - CurrentPair.second = Location; } MemoryAccessPair CurrentPair; memoryaccess_def_iterator DefIterator; MemoryLocation Location; MemoryAccess *OriginalAccess = nullptr; - bool WalkingPhi = false; DominatorTree *DT = nullptr; + bool WalkingPhi = false; + bool *PerformedPhiTranslation = nullptr; }; -inline upward_defs_iterator upward_defs_begin(const MemoryAccessPair &Pair, - DominatorTree &DT) { - return upward_defs_iterator(Pair, &DT); +inline upward_defs_iterator 
+upward_defs_begin(const MemoryAccessPair &Pair, DominatorTree &DT, + bool *PerformedPhiTranslation = nullptr) { + return upward_defs_iterator(Pair, &DT, PerformedPhiTranslation); } inline upward_defs_iterator upward_defs_end() { return upward_defs_iterator(); } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSAUpdater.h index 20588ef083c5..b0bf2e5ead62 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSAUpdater.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MemorySSAUpdater.h @@ -52,8 +52,6 @@ class LoopBlocksRPO; using ValueToValueMapTy = ValueMap; using PhiToDefMap = SmallDenseMap; using CFGUpdate = cfg::Update; -using GraphDiffInvBBPair = - std::pair *, Inverse>; class MemorySSAUpdater { private: @@ -121,8 +119,11 @@ public: ArrayRef ExitBlocks, ArrayRef> VMaps, DominatorTree &DT); - /// Apply CFG updates, analogous with the DT edge updates. - void applyUpdates(ArrayRef Updates, DominatorTree &DT); + /// Apply CFG updates, analogous with the DT edge updates. By default, the + /// DT is assumed to be already up to date. If UpdateDTFirst is true, first + /// update the DT with the same updates. + void applyUpdates(ArrayRef Updates, DominatorTree &DT, + bool UpdateDTFirst = false); /// Apply CFG insert updates, analogous with the DT edge updates. void applyInsertUpdates(ArrayRef Updates, DominatorTree &DT); diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h new file mode 100644 index 000000000000..99aa315319b8 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ModuleDebugInfoPrinter.h @@ -0,0 +1,29 @@ +//===- ModuleDebugInfoPrinter.h - -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_MODULEDEBUGINFOPRINTER_H +#define LLVM_ANALYSIS_MODULEDEBUGINFOPRINTER_H + +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class ModuleDebugInfoPrinterPass + : public PassInfoMixin { + DebugInfoFinder Finder; + raw_ostream &OS; + +public: + explicit ModuleDebugInfoPrinterPass(raw_ostream &OS); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_ANALYSIS_MODULEDEBUGINFOPRINTER_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/MustExecute.h b/contrib/llvm-project/llvm/include/llvm/Analysis/MustExecute.h index a3b7bee97808..df489aaa534d 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/MustExecute.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/MustExecute.h @@ -27,6 +27,8 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionPrecedenceTracking.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -541,6 +543,23 @@ private: MustBeExecutedIterator EndIterator; }; +class MustExecutePrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + MustExecutePrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +class MustBeExecutedContextPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + MustBeExecutedContextPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + } // namespace llvm #endif diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h 
b/contrib/llvm-project/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h index cad1c52f7f87..16c5f6701da0 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h @@ -23,7 +23,6 @@ #define LLVM_LIB_ANALYSIS_OBJCARCANALYSISUTILS_H #include "llvm/ADT/Optional.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -31,6 +30,9 @@ #include "llvm/IR/ValueHandle.h" namespace llvm { + +class AAResults; + namespace objcarc { /// A handy option to enable/disable all ARC Optimizations. @@ -64,10 +66,9 @@ inline bool ModuleHasARC(const Module &M) { /// This is a wrapper around getUnderlyingObject which also knows how to /// look through objc_retain and objc_autorelease calls, which we know to return /// their argument verbatim. -inline const Value *GetUnderlyingObjCPtr(const Value *V, - const DataLayout &DL) { +inline const Value *GetUnderlyingObjCPtr(const Value *V) { for (;;) { - V = GetUnderlyingObject(V, DL); + V = getUnderlyingObject(V); if (!IsForwarding(GetBasicARCInstKind(V))) break; V = cast(V)->getArgOperand(0); @@ -78,12 +79,12 @@ inline const Value *GetUnderlyingObjCPtr(const Value *V, /// A wrapper for GetUnderlyingObjCPtr used for results memoization. inline const Value * -GetUnderlyingObjCPtrCached(const Value *V, const DataLayout &DL, +GetUnderlyingObjCPtrCached(const Value *V, DenseMap &Cache) { if (auto InCache = Cache.lookup(V)) return InCache; - const Value *Computed = GetUnderlyingObjCPtr(V, DL); + const Value *Computed = GetUnderlyingObjCPtr(V); Cache[V] = const_cast(Computed); return Computed; } @@ -146,7 +147,7 @@ inline bool IsPotentialRetainableObjPtr(const Value *Op) { return false; // Special arguments can not be a valid retainable object pointer. 
if (const Argument *Arg = dyn_cast(Op)) - if (Arg->hasPassPointeeByValueAttr() || Arg->hasNestAttr() || + if (Arg->hasPassPointeeByValueCopyAttr() || Arg->hasNestAttr() || Arg->hasStructRetAttr()) return false; // Only consider values with pointer types. @@ -162,24 +163,7 @@ inline bool IsPotentialRetainableObjPtr(const Value *Op) { return true; } -inline bool IsPotentialRetainableObjPtr(const Value *Op, - AliasAnalysis &AA) { - // First make the rudimentary check. - if (!IsPotentialRetainableObjPtr(Op)) - return false; - - // Objects in constant memory are not reference-counted. - if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} +bool IsPotentialRetainableObjPtr(const Value *Op, AAResults &AA); /// Helper for GetARCInstKind. Determines what kind of construct CS /// is. diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h b/contrib/llvm-project/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h index ab97d5b8504e..9815dd05cd1c 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h @@ -88,8 +88,14 @@ public: /// provide more context so that non-trivial false positives can be quickly /// detected by the user. 
bool allowExtraAnalysis(StringRef PassName) const { - return (F->getContext().getLLVMRemarkStreamer() || - F->getContext().getDiagHandlerPtr()->isAnyRemarkEnabled(PassName)); + return OptimizationRemarkEmitter::allowExtraAnalysis(*F, PassName); + } + static bool allowExtraAnalysis(const Function &F, StringRef PassName) { + return allowExtraAnalysis(F.getContext(), PassName); + } + static bool allowExtraAnalysis(LLVMContext &Ctx, StringRef PassName) { + return Ctx.getLLVMRemarkStreamer() || + Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(PassName); } private: diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/PhiValues.h b/contrib/llvm-project/llvm/include/llvm/Analysis/PhiValues.h index ea879d727282..c0e91c8b0bdf 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/PhiValues.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/PhiValues.h @@ -21,7 +21,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" @@ -40,7 +40,7 @@ class Function; /// it is queried. class PhiValues { public: - using ValueSet = SmallPtrSet; + using ValueSet = SmallSetVector; /// Construct an empty PhiValues. PhiValues(const Function &F) : F(F) {} @@ -70,8 +70,7 @@ public: FunctionAnalysisManager::Invalidator &); private: - using PhiSet = SmallPtrSet; - using ConstValueSet = SmallPtrSet; + using ConstValueSet = SmallSetVector; /// The next depth number to be used by processPhi. 
unsigned int NextDepthNumber = 1; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ProfileSummaryInfo.h index a1fea9fefc9a..a4e6ffc3dd58 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ProfileSummaryInfo.h @@ -25,7 +25,6 @@ class BasicBlock; class BlockFrequencyInfo; class CallBase; class Function; -class ProfileSummary; /// Analysis providing profile information. /// @@ -39,7 +38,7 @@ class ProfileSummary; // units. This would require making this depend on BFI. class ProfileSummaryInfo { private: - Module &M; + const Module &M; std::unique_ptr Summary; void computeThresholds(); // Count thresholds to answer isHotCount and isColdCount queries. @@ -59,7 +58,8 @@ private: mutable DenseMap ThresholdCache; public: - ProfileSummaryInfo(Module &M) : M(M) { refresh(); } + ProfileSummaryInfo(const Module &M) : M(M) { refresh(); } + ProfileSummaryInfo(ProfileSummaryInfo &&Arg) = default; /// If no summary is present, attempt to refresh. 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfo.h index b0336c559774..f93081d6f51d 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfo.h @@ -59,7 +59,6 @@ namespace llvm { class DominanceFrontier; -class DominatorTree; class Loop; class LoopInfo; class PostDominatorTree; @@ -877,8 +876,6 @@ public: void verifyAnalysis() const; }; -class Region; - class RegionNode : public RegionNodeBase> { public: inline RegionNode(Region *Parent, BasicBlock *Entry, bool isSubRegion = false) diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfoImpl.h b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfoImpl.h index 8d9ec646f519..b694effb2229 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -585,10 +585,8 @@ bool RegionInfoBase::isRegion(BlockT *entry, BlockT *exit) const { // Exit is the header of a loop that contains the entry. In this case, // the dominance frontier must only contain the exit. if (!DT->dominates(entry, exit)) { - for (typename DST::iterator SI = entrySuccs->begin(), - SE = entrySuccs->end(); - SI != SE; ++SI) { - if (*SI != exit && *SI != entry) + for (BlockT *successor : *entrySuccs) { + if (successor != exit && successor != entry) return false; } @@ -817,8 +815,7 @@ void RegionInfoBase::verifyAnalysis() const { // Region pass manager support. template typename Tr::RegionT *RegionInfoBase::getRegionFor(BlockT *BB) const { - typename BBtoRegionMap::const_iterator I = BBtoRegion.find(BB); - return I != BBtoRegion.end() ? 
I->second : nullptr; + return BBtoRegion.lookup(BB); } template @@ -889,8 +886,7 @@ typename Tr::RegionT *RegionInfoBase::getCommonRegion(RegionT *A, template typename Tr::RegionT * RegionInfoBase::getCommonRegion(SmallVectorImpl &Regions) const { - RegionT *ret = Regions.back(); - Regions.pop_back(); + RegionT *ret = Regions.pop_back_val(); for (RegionT *R : Regions) ret = getCommonRegion(ret, R); diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionPass.h b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionPass.h index 995c5dca3de3..5c7fa5f56693 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/RegionPass.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/RegionPass.h @@ -85,8 +85,6 @@ protected: /// The pass manager to schedule RegionPasses. class RGPassManager : public FunctionPass, public PMDataManager { std::deque RQ; - bool skipThisRegion; - bool redoThisRegion; RegionInfo *RI; Region *CurrentRegion; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h new file mode 100644 index 000000000000..3018bcc241d8 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h @@ -0,0 +1,41 @@ +//===- ReplayInlineAdvisor.h - Replay Inline Advisor interface -*- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_ANALYSIS_REPLAYINLINEADVISOR_H +#define LLVM_ANALYSIS_REPLAYINLINEADVISOR_H + +#include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/InlineAdvisor.h" +#include "llvm/IR/LLVMContext.h" + +namespace llvm { +class BasicBlock; +class CallBase; +class Function; +class Module; +class OptimizationRemarkEmitter; + +/// Replay inline advisor that uses optimization remarks from inlining of +/// previous build to guide current inlining. This is useful for inliner tuning. +class ReplayInlineAdvisor : public InlineAdvisor { +public: + ReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, + LLVMContext &Context, + std::unique_ptr OriginalAdvisor, + StringRef RemarksFile, bool EmitRemarks); + std::unique_ptr getAdviceImpl(CallBase &CB) override; + bool areReplayRemarksLoaded() const { return HasReplayRemarks; } + +private: + StringSet<> InlineSitesFromRemarks; + std::unique_ptr OriginalAdvisor; + bool HasReplayRemarks = false; + bool EmitRemarks = false; +}; +} // namespace llvm +#endif // LLVM_ANALYSIS_REPLAYINLINEADVISOR_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolution.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolution.h index 81c5fc932588..b3f199de2cfa 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -70,6 +70,7 @@ class StructType; class TargetLibraryInfo; class Type; class Value; +enum SCEVTypes : unsigned short; /// This class represents an analyzed expression in the program. These are /// opaque objects that the client is not allowed to do much with directly. 
@@ -82,7 +83,7 @@ class SCEV : public FoldingSetNode { FoldingSetNodeIDRef FastID; // The SCEV baseclass this node corresponds to - const unsigned short SCEVType; + const SCEVTypes SCEVType; protected: // Estimated complexity of this node's expression tree size. @@ -119,13 +120,13 @@ public: NoWrapMask = (1 << 3) - 1 }; - explicit SCEV(const FoldingSetNodeIDRef ID, unsigned SCEVTy, + explicit SCEV(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy, unsigned short ExpressionSize) : FastID(ID), SCEVType(SCEVTy), ExpressionSize(ExpressionSize) {} SCEV(const SCEV &) = delete; SCEV &operator=(const SCEV &) = delete; - unsigned getSCEVType() const { return SCEVType; } + SCEVTypes getSCEVType() const { return SCEVType; } /// Return the LLVM type of this SCEV expression. Type *getType() const; @@ -511,6 +512,7 @@ public: const SCEV *getConstant(ConstantInt *V); const SCEV *getConstant(const APInt &Val); const SCEV *getConstant(Type *Ty, uint64_t V, bool isSigned = false); + const SCEV *getPtrToIntExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0); const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0); const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0); const SCEV *getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0); @@ -572,7 +574,9 @@ public: /// \p IndexExprs The expressions for the indices. const SCEV *getGEPExpr(GEPOperator *GEP, const SmallVectorImpl &IndexExprs); - const SCEV *getMinMaxExpr(unsigned Kind, + const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW); + const SCEV *getSignumExpr(const SCEV *Op); + const SCEV *getMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Operands); const SCEV *getSMaxExpr(const SCEV *LHS, const SCEV *RHS); const SCEV *getSMaxExpr(SmallVectorImpl &Operands); @@ -591,9 +595,22 @@ public: /// Return a SCEV for the constant 1 of a specific type. 
const SCEV *getOne(Type *Ty) { return getConstant(Ty, 1); } - /// Return an expression for sizeof AllocTy that is type IntTy + /// Return a SCEV for the constant -1 of a specific type. + const SCEV *getMinusOne(Type *Ty) { + return getConstant(Ty, -1, /*isSigned=*/true); + } + + /// Return an expression for sizeof ScalableTy that is type IntTy, where + /// ScalableTy is a scalable vector type. + const SCEV *getSizeOfScalableVectorExpr(Type *IntTy, + ScalableVectorType *ScalableTy); + + /// Return an expression for the alloc size of AllocTy that is type IntTy const SCEV *getSizeOfExpr(Type *IntTy, Type *AllocTy); + /// Return an expression for the store size of StoreTy that is type IntTy + const SCEV *getStoreSizeOfExpr(Type *IntTy, Type *StoreTy); + /// Return an expression for offsetof on the given field with type IntTy const SCEV *getOffsetOfExpr(Type *IntTy, StructType *STy, unsigned FieldNo); @@ -677,6 +694,12 @@ public: bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS); + /// Test whether entry to the basic block is protected by a conditional + /// between LHS and RHS. + bool isBasicBlockEntryGuardedByCond(const BasicBlock *BB, + ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS); + /// Test whether the backedge of the loop is protected by a conditional /// between LHS and RHS. This is used to eliminate casts. bool isLoopBackedgeGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, @@ -696,7 +719,8 @@ public: /// before taking the branch. For loops with multiple exits, it may not be /// the number times that the loop header executes if the loop exits /// prematurely via another branch. - unsigned getSmallConstantTripCount(const Loop *L, BasicBlock *ExitingBlock); + unsigned getSmallConstantTripCount(const Loop *L, + const BasicBlock *ExitingBlock); /// Returns the upper bound of the loop trip count as a normal unsigned /// value. 
@@ -718,8 +742,7 @@ public: /// for getSmallConstantTripCount, this assumes that control exits the loop /// via ExitingBlock. unsigned getSmallConstantTripMultiple(const Loop *L, - BasicBlock *ExitingBlock); - + const BasicBlock *ExitingBlock); /// The terms "backedge taken count" and "exit count" are used /// interchangeably to refer to the number of times the backedge of a loop @@ -730,6 +753,8 @@ public: Exact, /// A constant which provides an upper bound on the exact trip count. ConstantMaximum, + /// An expression which provides an upper bound on the exact trip count. + SymbolicMaximum, }; /// Return the number of times the backedge executes before the given exit @@ -737,8 +762,8 @@ public: /// For a single exit loop, this value is equivelent to the result of /// getBackedgeTakenCount. The loop is guaranteed to exit (via *some* exit) /// before the backedge is executed (ExitCount + 1) times. Note that there - /// is no guarantee about *which* exit is taken on the exiting iteration. - const SCEV *getExitCount(const Loop *L, BasicBlock *ExitingBlock, + /// is no guarantee about *which* exit is taken on the exiting iteration. + const SCEV *getExitCount(const Loop *L, const BasicBlock *ExitingBlock, ExitCountKind Kind = Exact); /// If the specified loop has a predictable backedge-taken count, return it, @@ -766,7 +791,15 @@ public: /// SCEVCouldNotCompute object. const SCEV *getConstantMaxBackedgeTakenCount(const Loop *L) { return getBackedgeTakenCount(L, ConstantMaximum); - } + } + + /// When successful, this returns a SCEV that is greater than or equal + /// to (i.e. a "conservative over-approximation") of the value returend by + /// getBackedgeTakenCount. If such a value cannot be computed, it returns the + /// SCEVCouldNotCompute object. 
+ const SCEV *getSymbolicMaxBackedgeTakenCount(const Loop *L) { + return getBackedgeTakenCount(L, SymbolicMaximum); + } /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. @@ -905,32 +938,61 @@ public: bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS); + /// Test if the given expression is known to satisfy the condition described + /// by Pred, LHS, and RHS in the given Context. + bool isKnownPredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, const Instruction *Context); + /// Test if the condition described by Pred, LHS, RHS is known to be true on /// every iteration of the loop of the recurrency LHS. bool isKnownOnEveryIteration(ICmpInst::Predicate Pred, const SCEVAddRecExpr *LHS, const SCEV *RHS); - /// Return true if, for all loop invariant X, the predicate "LHS `Pred` X" - /// is monotonically increasing or decreasing. In the former case set - /// `Increasing` to true and in the latter case set `Increasing` to false. - /// /// A predicate is said to be monotonically increasing if may go from being /// false to being true as the loop iterates, but never the other way /// around. A predicate is said to be monotonically decreasing if may go /// from being true to being false as the loop iterates, but never the other /// way around. - bool isMonotonicPredicate(const SCEVAddRecExpr *LHS, ICmpInst::Predicate Pred, - bool &Increasing); - - /// Return true if the result of the predicate LHS `Pred` RHS is loop - /// invariant with respect to L. Set InvariantPred, InvariantLHS and - /// InvariantLHS so that InvariantLHS `InvariantPred` InvariantRHS is the - /// loop invariant form of LHS `Pred` RHS. 
- bool isLoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, - const SCEV *RHS, const Loop *L, - ICmpInst::Predicate &InvariantPred, - const SCEV *&InvariantLHS, - const SCEV *&InvariantRHS); + enum MonotonicPredicateType { + MonotonicallyIncreasing, + MonotonicallyDecreasing + }; + + /// If, for all loop invariant X, the predicate "LHS `Pred` X" is + /// monotonically increasing or decreasing, returns + /// Some(MonotonicallyIncreasing) and Some(MonotonicallyDecreasing) + /// respectively. If we could not prove either of these facts, returns None. + Optional + getMonotonicPredicateType(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred); + + struct LoopInvariantPredicate { + ICmpInst::Predicate Pred; + const SCEV *LHS; + const SCEV *RHS; + + LoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS) + : Pred(Pred), LHS(LHS), RHS(RHS) {} + }; + /// If the result of the predicate LHS `Pred` RHS is loop invariant with + /// respect to L, return a LoopInvariantPredicate with LHS and RHS being + /// invariants, available at L's entry. Otherwise, return None. + Optional + getLoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, const Loop *L); + + /// If the result of the predicate LHS `Pred` RHS is loop invariant with + /// respect to L at given Context during at least first MaxIter iterations, + /// return a LoopInvariantPredicate with LHS and RHS being invariants, + /// available at L's entry. Otherwise, return None. The predicate should be + /// the loop's exit condition. + Optional + getLoopInvariantExitCondDuringFirstIterations(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS, const Loop *L, + const Instruction *Context, + const SCEV *MaxIter); /// Simplify LHS and RHS in a comparison with predicate Pred. Return true /// iff any changes were made. 
If the operands are provably equal or @@ -1101,6 +1163,20 @@ public: const SCEV *S, const Loop *L, SmallPtrSetImpl &Preds); + /// Compute \p LHS - \p RHS and returns the result as an APInt if it is a + /// constant, and None if it isn't. + /// + /// This is intended to be a cheaper version of getMinusSCEV. We can be + /// frugal here since we just bail out of actually constructing and + /// canonicalizing an expression in the cases where the result isn't going + /// to be a constant. + Optional computeConstantDifference(const SCEV *LHS, const SCEV *RHS); + + /// Update no-wrap flags of an AddRec. This may drop the cached info about + /// this AddRec (such as range info) in case if new flags may potentially + /// sharpen it. + void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags); + private: /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a /// Value is deleted. @@ -1181,7 +1257,7 @@ private: ValueExprMapType ValueExprMap; /// Mark predicate values currently being processed by isImpliedCond. - SmallPtrSet PendingLoopPredicates; + SmallPtrSet PendingLoopPredicates; /// Mark SCEVUnknown Phis currently being processed by getRangeRef. SmallPtrSet PendingPhiRanges; @@ -1284,39 +1360,41 @@ private: /// never have more than one computable exit. SmallVector ExitNotTaken; - /// The pointer part of \c MaxAndComplete is an expression indicating the - /// least maximum backedge-taken count of the loop that is known, or a - /// SCEVCouldNotCompute. This expression is only valid if the predicates - /// associated with all loop exits are true. - /// - /// The integer part of \c MaxAndComplete is a boolean indicating if \c - /// ExitNotTaken has an element for every exiting block in the loop. - PointerIntPair MaxAndComplete; + /// Expression indicating the least constant maximum backedge-taken count of + /// the loop that is known, or a SCEVCouldNotCompute. 
This expression is + /// only valid if the redicates associated with all loop exits are true. + const SCEV *ConstantMax; + + /// Indicating if \c ExitNotTaken has an element for every exiting block in + /// the loop. + bool IsComplete; + + /// Expression indicating the least maximum backedge-taken count of the loop + /// that is known, or a SCEVCouldNotCompute. Lazily computed on first query. + const SCEV *SymbolicMax = nullptr; /// True iff the backedge is taken either exactly Max or zero times. bool MaxOrZero = false; - /// \name Helper projection functions on \c MaxAndComplete. - /// @{ - bool isComplete() const { return MaxAndComplete.getInt(); } - const SCEV *getMax() const { return MaxAndComplete.getPointer(); } - /// @} + bool isComplete() const { return IsComplete; } + const SCEV *getConstantMax() const { return ConstantMax; } public: - BackedgeTakenInfo() : MaxAndComplete(nullptr, 0) {} + BackedgeTakenInfo() : ConstantMax(nullptr), IsComplete(false) {} BackedgeTakenInfo(BackedgeTakenInfo &&) = default; BackedgeTakenInfo &operator=(BackedgeTakenInfo &&) = default; using EdgeExitInfo = std::pair; /// Initialize BackedgeTakenInfo from a list of exact exit counts. - BackedgeTakenInfo(ArrayRef ExitCounts, bool Complete, - const SCEV *MaxCount, bool MaxOrZero); + BackedgeTakenInfo(ArrayRef ExitCounts, bool IsComplete, + const SCEV *ConstantMax, bool MaxOrZero); /// Test whether this BackedgeTakenInfo contains any computed information, /// or whether it's all SCEVCouldNotCompute values. bool hasAnyInfo() const { - return !ExitNotTaken.empty() || !isa(getMax()); + return !ExitNotTaken.empty() || + !isa(getConstantMax()); } /// Test whether this BackedgeTakenInfo contains complete information. @@ -1347,17 +1425,22 @@ private: /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via /// this block before this number of iterations, but may exit via another /// block. 
- const SCEV *getExact(BasicBlock *ExitingBlock, ScalarEvolution *SE) const; + const SCEV *getExact(const BasicBlock *ExitingBlock, + ScalarEvolution *SE) const; + + /// Get the constant max backedge taken count for the loop. + const SCEV *getConstantMax(ScalarEvolution *SE) const; - /// Get the max backedge taken count for the loop. - const SCEV *getMax(ScalarEvolution *SE) const; + /// Get the constant max backedge taken count for the particular loop exit. + const SCEV *getConstantMax(const BasicBlock *ExitingBlock, + ScalarEvolution *SE) const; - /// Get the max backedge taken count for the particular loop exit. - const SCEV *getMax(BasicBlock *ExitingBlock, ScalarEvolution *SE) const; + /// Get the symbolic max backedge taken count for the loop. + const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE); /// Return true if the number of times this backedge is taken is either the - /// value returned by getMax or zero. - bool isMaxOrZero(ScalarEvolution *SE) const; + /// value returned by getConstantMax or zero. + bool isConstantMaxOrZero(ScalarEvolution *SE) const; /// Return true if any backedge taken count expressions refer to the given /// subexpression. @@ -1462,6 +1545,13 @@ private: ConstantRange getRangeForAffineAR(const SCEV *Start, const SCEV *Stop, const SCEV *MaxBECount, unsigned BitWidth); + /// Determines the range for the affine non-self-wrapping SCEVAddRecExpr {\p + /// Start,+,\p Stop}. + ConstantRange getRangeForAffineNoSelfWrappingAR(const SCEVAddRecExpr *AddRec, + const SCEV *MaxBECount, + unsigned BitWidth, + RangeSignHint SignHint); + /// Try to compute a range for the affine SCEVAddRecExpr {\p Start,+,\p /// Stop} by "factoring out" a ternary expression from the add recurrence. /// Helper called by \c getRange. @@ -1507,7 +1597,7 @@ private: /// Return the BackedgeTakenInfo for the given loop, lazily computing new /// values if the loop hasn't been analyzed yet. The returned result is /// guaranteed not to be predicated. 
- const BackedgeTakenInfo &getBackedgeTakenInfo(const Loop *L); + BackedgeTakenInfo &getBackedgeTakenInfo(const Loop *L); /// Similar to getBackedgeTakenInfo, but will add predicates as required /// with the purpose of returning complete information. @@ -1540,6 +1630,11 @@ private: bool ExitIfTrue, bool ControlsExit, bool AllowPredicates = false); + /// Return a symbolic upper bound for the backedge taken count of the loop. + /// This is more general than getConstantMaxBackedgeTakenCount as it returns + /// an arbitrary expression as opposed to only constants. + const SCEV *computeSymbolicMaxBackedgeTakenCount(const Loop *L); + // Helper functions for computeExitLimitFromCond to avoid exponential time // complexity. @@ -1577,6 +1672,10 @@ private: Value *ExitCond, bool ExitIfTrue, bool ControlsExit, bool AllowPredicates); + Optional + computeExitLimitFromCondFromBinOp(ExitLimitCacheTy &Cache, const Loop *L, + Value *ExitCond, bool ExitIfTrue, + bool ControlsExit, bool AllowPredicates); /// Compute the number of times the backedge of the specified loop will /// execute if its exit condition were a conditional branch of the ICmpInst @@ -1655,27 +1754,44 @@ private: /// Return a predecessor of BB (which may not be an immediate predecessor) /// which has exactly one successor from which BB is reachable, or null if /// no such block is found. - std::pair - getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB); + std::pair + getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) const; /// Test whether the condition described by Pred, LHS, and RHS is true - /// whenever the given FoundCondValue value evaluates to true. + /// whenever the given FoundCondValue value evaluates to true in given + /// Context. If Context is nullptr, then the found predicate is true + /// everywhere. LHS and FoundLHS may have different type width. 
bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - Value *FoundCondValue, bool Inverse); + const Value *FoundCondValue, bool Inverse, + const Instruction *Context = nullptr); + + /// Test whether the condition described by Pred, LHS, and RHS is true + /// whenever the given FoundCondValue value evaluates to true in given + /// Context. If Context is nullptr, then the found predicate is true + /// everywhere. LHS and FoundLHS must have same type width. + bool isImpliedCondBalancedTypes(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, + ICmpInst::Predicate FoundPred, + const SCEV *FoundLHS, const SCEV *FoundRHS, + const Instruction *Context); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. bool isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is @@ -1708,7 +1824,7 @@ private: /// Return true if the condition denoted by \p LHS \p Pred \p RHS is implied /// by a call to @llvm.experimental.guard in \p BB. 
- bool isImpliedViaGuard(BasicBlock *BB, ICmpInst::Predicate Pred, + bool isImpliedViaGuard(const BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS); /// Test whether the condition described by Pred, LHS, and RHS is true @@ -1722,6 +1838,18 @@ private: const SCEV *FoundLHS, const SCEV *FoundRHS); + /// Test whether the condition described by Pred, LHS, and RHS is true + /// whenever the condition described by Pred, FoundLHS, and FoundRHS is + /// true. + /// + /// This routine tries to weaken the known condition basing on fact that + /// FoundLHS is an AddRec. + bool isImpliedCondOperandsViaAddRecStart(ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, + const SCEV *FoundRHS, + const Instruction *Context); + /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is /// true. @@ -1762,15 +1890,6 @@ private: bool splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags); - /// Compute \p LHS - \p RHS and returns the result as an APInt if it is a - /// constant, and None if it isn't. - /// - /// This is intended to be a cheaper version of getMinusSCEV. We can be - /// frugal here since we just bail out of actually constructing and - /// canonicalizing an expression in the cases where the result isn't going - /// to be a constant. - Optional computeConstantDifference(const SCEV *LHS, const SCEV *RHS); - /// Drop memoized information computed for S. void forgetMemoizedResults(const SCEV *S); @@ -1793,8 +1912,17 @@ private: /// Try to prove NSW or NUW on \p AR relying on ConstantRange manipulation. SCEV::NoWrapFlags proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR); - bool isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS, - ICmpInst::Predicate Pred, bool &Increasing); + /// Try to prove NSW on \p AR by proving facts about conditions known on + /// entry and backedge. 
+ SCEV::NoWrapFlags proveNoSignedWrapViaInduction(const SCEVAddRecExpr *AR); + + /// Try to prove NUW on \p AR by proving facts about conditions known on + /// entry and backedge. + SCEV::NoWrapFlags proveNoUnsignedWrapViaInduction(const SCEVAddRecExpr *AR); + + Optional + getMonotonicPredicateTypeImpl(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred); /// Return SCEV no-wrap flags that can be proven based on reasoning about /// how poison produced from no-wrap flags on this value (e.g. a nuw add) @@ -1893,6 +2021,9 @@ private: /// Assign A and B to LHS and RHS, respectively. bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS); + /// Try to apply information from loop guards for \p L to \p Expr. + const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L); + /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in /// `UniqueSCEVs`. /// @@ -1901,7 +2032,7 @@ private: /// constructed to look up the SCEV and the third component is the insertion /// point. std::tuple - findExistingSCEVInCache(int SCEVType, ArrayRef Ops); + findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef Ops); FoldingSet UniqueSCEVs; FoldingSet UniquePreds; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h index 480f92c117a0..24f0c51487bd 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h @@ -33,6 +33,7 @@ public: // Except in the trivial case described above, we do not know how to divide // Expr by Denominator for the following functions with empty implementation. 
+ void visitPtrToIntExpr(const SCEVPtrToIntExpr *Numerator) {} void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {} void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {} void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {} diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h index 0076e02ae1bf..37e675f08afc 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -35,12 +35,12 @@ class ConstantRange; class Loop; class Type; - enum SCEVTypes { + enum SCEVTypes : unsigned short { // These should be ordered in terms of increasing complexity to make the // folders simpler. scConstant, scTruncate, scZeroExtend, scSignExtend, scAddExpr, scMulExpr, scUDivExpr, scAddRecExpr, scUMaxExpr, scSMaxExpr, scUMinExpr, scSMinExpr, - scUnknown, scCouldNotCompute + scPtrToInt, scUnknown, scCouldNotCompute }; /// This class represents a constant integer value. @@ -74,16 +74,56 @@ class Type; /// This is the base class for unary cast operator classes. 
class SCEVCastExpr : public SCEV { protected: - const SCEV *Op; + std::array Operands; Type *Ty; - SCEVCastExpr(const FoldingSetNodeIDRef ID, - unsigned SCEVTy, const SCEV *op, Type *ty); + SCEVCastExpr(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy, const SCEV *op, + Type *ty); public: - const SCEV *getOperand() const { return Op; } + const SCEV *getOperand() const { return Operands[0]; } + const SCEV *getOperand(unsigned i) const { + assert(i == 0 && "Operand index out of range!"); + return Operands[0]; + } + using op_iterator = std::array::const_iterator; + using op_range = iterator_range; + + op_range operands() const { + return make_range(Operands.begin(), Operands.end()); + } + size_t getNumOperands() const { return 1; } Type *getType() const { return Ty; } + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const SCEV *S) { + return S->getSCEVType() == scPtrToInt || S->getSCEVType() == scTruncate || + S->getSCEVType() == scZeroExtend || + S->getSCEVType() == scSignExtend; + } + }; + + /// This class represents a cast from a pointer to a pointer-sized integer + /// value. + class SCEVPtrToIntExpr : public SCEVCastExpr { + friend class ScalarEvolution; + + SCEVPtrToIntExpr(const FoldingSetNodeIDRef ID, const SCEV *Op, Type *ITy); + + public: + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const SCEV *S) { + return S->getSCEVType() == scPtrToInt; + } + }; + + /// This is the base class for unary integral cast operator classes. + class SCEVIntegralCastExpr : public SCEVCastExpr { + protected: + SCEVIntegralCastExpr(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy, + const SCEV *op, Type *ty); + + public: /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const SCEV *S) { return S->getSCEVType() == scTruncate || @@ -94,7 +134,7 @@ class Type; /// This class represents a truncation of an integer value to a /// smaller integer value. 
- class SCEVTruncateExpr : public SCEVCastExpr { + class SCEVTruncateExpr : public SCEVIntegralCastExpr { friend class ScalarEvolution; SCEVTruncateExpr(const FoldingSetNodeIDRef ID, @@ -109,7 +149,7 @@ class Type; /// This class represents a zero extension of a small integer value /// to a larger integer value. - class SCEVZeroExtendExpr : public SCEVCastExpr { + class SCEVZeroExtendExpr : public SCEVIntegralCastExpr { friend class ScalarEvolution; SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID, @@ -124,7 +164,7 @@ class Type; /// This class represents a sign extension of a small integer value /// to a larger integer value. - class SCEVSignExtendExpr : public SCEVCastExpr { + class SCEVSignExtendExpr : public SCEVIntegralCastExpr { friend class ScalarEvolution; SCEVSignExtendExpr(const FoldingSetNodeIDRef ID, @@ -263,16 +303,28 @@ class Type; class SCEVUDivExpr : public SCEV { friend class ScalarEvolution; - const SCEV *LHS; - const SCEV *RHS; + std::array Operands; SCEVUDivExpr(const FoldingSetNodeIDRef ID, const SCEV *lhs, const SCEV *rhs) - : SCEV(ID, scUDivExpr, computeExpressionSize({lhs, rhs})), LHS(lhs), - RHS(rhs) {} + : SCEV(ID, scUDivExpr, computeExpressionSize({lhs, rhs})) { + Operands[0] = lhs; + Operands[1] = rhs; + } public: - const SCEV *getLHS() const { return LHS; } - const SCEV *getRHS() const { return RHS; } + const SCEV *getLHS() const { return Operands[0]; } + const SCEV *getRHS() const { return Operands[1]; } + size_t getNumOperands() const { return 2; } + const SCEV *getOperand(unsigned i) const { + assert((i == 0 || i == 1) && "Operand index out of range!"); + return i == 0 ? 
getLHS() : getRHS(); + } + + using op_iterator = std::array::const_iterator; + using op_range = iterator_range; + op_range operands() const { + return make_range(Operands.begin(), Operands.end()); + } Type *getType() const { // In most cases the types of LHS and RHS will be the same, but in some @@ -389,7 +441,7 @@ class Type; public: static bool classof(const SCEV *S) { - return isMinMaxType(static_cast(S->getSCEVType())); + return isMinMaxType(S->getSCEVType()); } static enum SCEVTypes negate(enum SCEVTypes T) { @@ -518,6 +570,8 @@ class Type; switch (S->getSCEVType()) { case scConstant: return ((SC*)this)->visitConstant((const SCEVConstant*)S); + case scPtrToInt: + return ((SC *)this)->visitPtrToIntExpr((const SCEVPtrToIntExpr *)S); case scTruncate: return ((SC*)this)->visitTruncateExpr((const SCEVTruncateExpr*)S); case scZeroExtend: @@ -544,9 +598,8 @@ class Type; return ((SC*)this)->visitUnknown((const SCEVUnknown*)S); case scCouldNotCompute: return ((SC*)this)->visitCouldNotCompute((const SCEVCouldNotCompute*)S); - default: - llvm_unreachable("Unknown SCEV type!"); } + llvm_unreachable("Unknown SCEV kind!"); } RetVal visitCouldNotCompute(const SCEVCouldNotCompute *S) { @@ -583,12 +636,13 @@ class Type; switch (S->getSCEVType()) { case scConstant: case scUnknown: - break; + continue; + case scPtrToInt: case scTruncate: case scZeroExtend: case scSignExtend: push(cast(S)->getOperand()); - break; + continue; case scAddExpr: case scMulExpr: case scSMaxExpr: @@ -598,18 +652,17 @@ class Type; case scAddRecExpr: for (const auto *Op : cast(S)->operands()) push(Op); - break; + continue; case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(S); push(UDiv->getLHS()); push(UDiv->getRHS()); - break; + continue; } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - default: - llvm_unreachable("Unknown SCEV kind!"); } + llvm_unreachable("Unknown SCEV kind!"); } } }; @@ -677,6 +730,13 @@ class Type; return Constant; } + const SCEV 
*visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { + const SCEV *Operand = ((SC *)this)->visit(Expr->getOperand()); + return Operand == Expr->getOperand() + ? Expr + : SE.getPtrToIntExpr(Operand, Expr->getType()); + } + const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) { const SCEV *Operand = ((SC*)this)->visit(Expr->getOperand()); return Operand == Expr->getOperand() @@ -787,35 +847,30 @@ class Type; }; using ValueToValueMap = DenseMap; + using ValueToSCEVMapTy = DenseMap; /// The SCEVParameterRewriter takes a scalar evolution expression and updates - /// the SCEVUnknown components following the Map (Value -> Value). + /// the SCEVUnknown components following the Map (Value -> SCEV). class SCEVParameterRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE, - ValueToValueMap &Map, - bool InterpretConsts = false) { - SCEVParameterRewriter Rewriter(SE, Map, InterpretConsts); + ValueToSCEVMapTy &Map) { + SCEVParameterRewriter Rewriter(SE, Map); return Rewriter.visit(Scev); } - SCEVParameterRewriter(ScalarEvolution &SE, ValueToValueMap &M, bool C) - : SCEVRewriteVisitor(SE), Map(M), InterpretConsts(C) {} + SCEVParameterRewriter(ScalarEvolution &SE, ValueToSCEVMapTy &M) + : SCEVRewriteVisitor(SE), Map(M) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { - Value *V = Expr->getValue(); - if (Map.count(V)) { - Value *NV = Map[V]; - if (InterpretConsts && isa(NV)) - return SE.getConstant(cast(NV)); - return SE.getUnknown(NV); - } - return Expr; + auto I = Map.find(Expr->getValue()); + if (I == Map.end()) + return Expr; + return I->second; } private: - ValueToValueMap ⤅ - bool InterpretConsts; + ValueToSCEVMapTy ⤅ }; using LoopToScevMapT = DenseMap; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/SparsePropagation.h b/contrib/llvm-project/llvm/include/llvm/Analysis/SparsePropagation.h index fac92e4a25a4..81a2533152de 100644 --- 
a/contrib/llvm-project/llvm/include/llvm/Analysis/SparsePropagation.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/SparsePropagation.h @@ -485,8 +485,7 @@ void SparseSolver::Solve() { // Process the basic block work list. while (!BBWorkList.empty()) { - BasicBlock *BB = BBWorkList.back(); - BBWorkList.pop_back(); + BasicBlock *BB = BBWorkList.pop_back_val(); LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB); diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/StackLifetime.h b/contrib/llvm-project/llvm/include/llvm/Analysis/StackLifetime.h index 8abc6cc1ce00..df342a9533ee 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/StackLifetime.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/StackLifetime.h @@ -13,6 +13,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/raw_ostream.h" @@ -121,6 +122,8 @@ private: DenseMap, 4>> BBMarkers; + bool HasUnknownLifetimeStartOrEnd = false; + void dumpAllocas() const; void dumpBlockLiveness() const; void dumpLiveRanges() const; @@ -166,16 +169,9 @@ public: static inline raw_ostream &operator<<(raw_ostream &OS, const BitVector &V) { OS << "{"; - int Idx = V.find_first(); - bool First = true; - while (Idx >= 0) { - if (!First) { - OS << ", "; - } - First = false; - OS << Idx; - Idx = V.find_next(Idx); - } + ListSeparator LS; + for (int Idx = V.find_first(); Idx >= 0; Idx = V.find_next(Idx)) + OS << LS << Idx; OS << "}"; return OS; } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/StackSafetyAnalysis.h index 846c2e6f7e91..59c1e3e3bd56 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/StackSafetyAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/StackSafetyAnalysis.h @@ -51,7 +51,8 @@ public: /// StackSafety assumes 
that missing parameter information means possibility /// of access to the parameter with any offset, so we can correctly link /// code without StackSafety information, e.g. non-ThinLTO. - std::vector getParamAccesses() const; + std::vector + getParamAccesses(ModuleSummaryIndex &Index) const; }; class StackSafetyGlobalInfo { diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h index 2f07b3135308..9838d629e93e 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/SyncDependenceAnalysis.h @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include +#include namespace llvm { @@ -30,6 +31,26 @@ class Loop; class PostDominatorTree; using ConstBlockSet = SmallPtrSet; +struct ControlDivergenceDesc { + // Join points of divergent disjoint paths. + ConstBlockSet JoinDivBlocks; + // Divergent loop exits + ConstBlockSet LoopDivBlocks; +}; + +struct ModifiedPO { + std::vector LoopPO; + std::unordered_map POIndex; + void appendBlock(const BasicBlock &BB) { + POIndex[&BB] = LoopPO.size(); + LoopPO.push_back(&BB); + } + unsigned getIndexOf(const BasicBlock &BB) const { + return POIndex.find(&BB)->second; + } + unsigned size() const { return LoopPO.size(); } + const BasicBlock *getBlockAt(unsigned Idx) const { return LoopPO[Idx]; } +}; /// \brief Relates points of divergent control to join points in /// reducible CFGs. @@ -51,28 +72,19 @@ public: /// header. Those exit blocks are added to the returned set. /// If L is the parent loop of \p Term and an exit of L is in the returned /// set then L is a divergent loop. - const ConstBlockSet &join_blocks(const Instruction &Term); - - /// \brief Computes divergent join points and loop exits (in the surrounding - /// loop) caused by the divergent loop exits of\p Loop. 
- /// - /// The set of blocks which are reachable by disjoint paths from the - /// loop exits of \p Loop. - /// This treats the loop as a single node in \p Loop's parent loop. - /// The returned set has the same properties as for join_blocks(TermInst&). - const ConstBlockSet &join_blocks(const Loop &Loop); + const ControlDivergenceDesc &getJoinBlocks(const Instruction &Term); private: - static ConstBlockSet EmptyBlockSet; + static ControlDivergenceDesc EmptyDivergenceDesc; + + ModifiedPO LoopPO; - ReversePostOrderTraversal FuncRPOT; const DominatorTree &DT; const PostDominatorTree &PDT; const LoopInfo &LI; - std::map> CachedLoopExitJoins; - std::map> - CachedBranchJoins; + std::map> + CachedControlDivDescs; }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 3864d4955104..defc95d0062a 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -262,6 +262,12 @@ TLI_DEFINE_STRING_INTERNAL("__atanhf_finite") /// long double __atanhl_finite(long double x); TLI_DEFINE_ENUM_INTERNAL(atanhl_finite) TLI_DEFINE_STRING_INTERNAL("__atanhl_finite") +/// void __atomic_load(size_t size, void *mptr, void *vptr, int smodel); +TLI_DEFINE_ENUM_INTERNAL(atomic_load) +TLI_DEFINE_STRING_INTERNAL("__atomic_load") +/// void __atomic_store(size_t size, void *mptr, void *vptr, int smodel); +TLI_DEFINE_ENUM_INTERNAL(atomic_store) +TLI_DEFINE_STRING_INTERNAL("__atomic_store") /// double __cosh_finite(double x); TLI_DEFINE_ENUM_INTERNAL(cosh_finite) TLI_DEFINE_STRING_INTERNAL("__cosh_finite") @@ -360,6 +366,9 @@ TLI_DEFINE_STRING_INTERNAL("__memcpy_chk") /// void *__memmove_chk(void *s1, const void *s2, size_t n, size_t s1size); TLI_DEFINE_ENUM_INTERNAL(memmove_chk) TLI_DEFINE_STRING_INTERNAL("__memmove_chk") +/// void *__mempcpy_chk(void *s1, const void *s2, 
size_t n, size_t s1size); +TLI_DEFINE_ENUM_INTERNAL(mempcpy_chk) +TLI_DEFINE_STRING_INTERNAL("__mempcpy_chk") /// void *__memset_chk(void *s, char v, size_t n, size_t s1size); TLI_DEFINE_ENUM_INTERNAL(memset_chk) TLI_DEFINE_STRING_INTERNAL("__memset_chk") @@ -1411,6 +1420,18 @@ TLI_DEFINE_STRING_INTERNAL("utimes") /// void *valloc(size_t size); TLI_DEFINE_ENUM_INTERNAL(valloc) TLI_DEFINE_STRING_INTERNAL("valloc") +/// void *vec_calloc(size_t count, size_t size); +TLI_DEFINE_ENUM_INTERNAL(vec_calloc) +TLI_DEFINE_STRING_INTERNAL("vec_calloc") +/// void vec_free(void *ptr); +TLI_DEFINE_ENUM_INTERNAL(vec_free) +TLI_DEFINE_STRING_INTERNAL("vec_free") +/// void *vec_malloc(size_t size); +TLI_DEFINE_ENUM_INTERNAL(vec_malloc) +TLI_DEFINE_STRING_INTERNAL("vec_malloc") +/// void *vec_realloc(void *ptr, size_t size); +TLI_DEFINE_ENUM_INTERNAL(vec_realloc) +TLI_DEFINE_STRING_INTERNAL("vec_realloc") /// int vfprintf(FILE *stream, const char *format, va_list ap); TLI_DEFINE_ENUM_INTERNAL(vfprintf) TLI_DEFINE_STRING_INTERNAL("vfprintf") diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 3a7c26e1463b..34a8a1e3407c 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -88,6 +88,7 @@ public: enum VectorLibrary { NoLibrary, // Don't use any vector library. Accelerate, // Use Accelerate framework. + LIBMVEC_X86,// GLIBC Vector Math library. MASSV, // IBM MASS vector library. SVML // Intel short vector math library. 
}; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h index b6698eefdb01..cdfb04424e56 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -21,11 +21,13 @@ #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/InstructionCost.h" #include namespace llvm { @@ -42,6 +44,7 @@ class CallBase; class ExtractElementInst; class Function; class GlobalValue; +class InstCombiner; class IntrinsicInst; class LoadInst; class LoopAccessInfo; @@ -56,6 +59,7 @@ class TargetLibraryInfo; class Type; class User; class Value; +struct KnownBits; template class Optional; /// Information about a load/store intrinsic defined by the target. @@ -90,7 +94,7 @@ struct HardwareLoopInfo { Loop *L = nullptr; BasicBlock *ExitBlock = nullptr; BranchInst *ExitBranch = nullptr; - const SCEV *ExitCount = nullptr; + const SCEV *TripCount = nullptr; IntegerType *CountType = nullptr; Value *LoopDecrement = nullptr; // Decrement the loop counter by this // value in every iteration. @@ -114,7 +118,7 @@ class IntrinsicCostAttributes { SmallVector ParamTys; SmallVector Arguments; FastMathFlags FMF; - unsigned VF = 1; + ElementCount VF = ElementCount::getFixed(1); // If ScalarizationCost is UINT_MAX, the cost of scalarizing the // arguments and the return value will be computed based on types. 
unsigned ScalarizationCost = std::numeric_limits::max(); @@ -125,10 +129,10 @@ public: IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI); IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, - unsigned Factor); + ElementCount Factor); IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, - unsigned Factor, unsigned ScalarCost); + ElementCount Factor, unsigned ScalarCost); IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy, ArrayRef Tys, FastMathFlags Flags); @@ -151,7 +155,7 @@ public: Intrinsic::ID getID() const { return IID; } const IntrinsicInst *getInst() const { return II; } Type *getReturnType() const { return RetTy; } - unsigned getVectorFactor() const { return VF; } + ElementCount getVectorFactor() const { return VF; } FastMathFlags getFlags() const { return FMF; } unsigned getScalarizationCost() const { return ScalarizationCost; } const SmallVectorImpl &getArgs() const { return Arguments; } @@ -228,19 +232,24 @@ public: /// /// Note, this method does not cache the cost calculation and it /// can be expensive in some cases. - int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const { + InstructionCost getInstructionCost(const Instruction *I, + enum TargetCostKind kind) const { + InstructionCost Cost; switch (kind) { case TCK_RecipThroughput: - return getInstructionThroughput(I); - + Cost = getInstructionThroughput(I); + break; case TCK_Latency: - return getInstructionLatency(I); - + Cost = getInstructionLatency(I); + break; case TCK_CodeSize: case TCK_SizeAndLatency: - return getUserCost(I, kind); + Cost = getUserCost(I, kind); + break; } - llvm_unreachable("Unknown instruction cost kind"); + if (Cost == -1) + Cost.setInvalid(); + return Cost; } /// Underlying constants for 'cost' values in this interface. @@ -280,6 +289,9 @@ public: /// individual classes of instructions would be better. unsigned getInliningThresholdMultiplier() const; + /// \returns A value to be added to the inlining threshold. 
+ unsigned adjustInliningThreshold(const CallBase *CB) const; + /// \returns Vector bonus in percent. /// /// Vector bonuses: We want to more aggressively inline vector-dense kernels @@ -323,8 +335,7 @@ public: /// This is a helper function which calls the two-argument getUserCost /// with \p Operands which are the current operands U has. int getUserCost(const User *U, TargetCostKind CostKind) const { - SmallVector Operands(U->value_op_begin(), - U->value_op_end()); + SmallVector Operands(U->operand_values()); return getUserCost(U, Operands, CostKind); } @@ -379,6 +390,8 @@ public: bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const; + unsigned getAssumedAddrSpace(const Value *V) const; + /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p /// NewV, which has a different address space. This should happen for every /// operand index that collectFlatAddressOperands returned for the intrinsic. @@ -542,6 +555,29 @@ public: /// target-independent defaults with information from \p L and \p SE. void getPeelingPreferences(Loop *L, ScalarEvolution &SE, PeelingPreferences &PP) const; + + /// Targets can implement their own combinations for target-specific + /// intrinsics. This function will be called from the InstCombine pass every + /// time a target-specific intrinsic is encountered. + /// + /// \returns None to not do anything target specific or a value that will be + /// returned from the InstCombiner. It is possible to return null and stop + /// further processing of the intrinsic by returning nullptr. + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// Can be used to implement target-specific instruction combining. + /// \see instCombineIntrinsic + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const; + /// Can be used to implement target-specific instruction combining. 
+ /// \see instCombineIntrinsic + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; /// @} /// \name Scalar Target Information @@ -583,6 +619,11 @@ public: bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) const; + /// Return true if LSR major cost is number of registers. Targets which + /// implement their own isLSRCostLess and unset number of registers as major + /// cost should return false, otherwise return true. + bool isNumRegsMajorCostOfLSR() const; + /// \returns true if LSR should not optimize a chain that includes \p I. bool isProfitableLSRChainElement(Instruction *I) const; @@ -672,6 +713,9 @@ public: /// Return true if this type is legal. bool isTypeLegal(Type *Ty) const; + /// Returns the estimated number of registers required to represent \p Ty. + unsigned getRegUsageForType(Type *Ty) const; + /// Return true if switches should be turned into lookup tables for the /// target. bool shouldBuildLookupTables() const; @@ -780,8 +824,9 @@ public: /// Return the expected cost of materialization for the given integer /// immediate of the specified type for a given instruction. The cost can be /// zero if the immediate can be folded into the specified instruction. 
- int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, - Type *Ty, TargetCostKind CostKind) const; + int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty, + TargetCostKind CostKind, + Instruction *Inst = nullptr) const; int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TargetCostKind CostKind) const; @@ -845,6 +890,10 @@ public: static ReductionKind matchVectorSplittingReduction( const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty); + static ReductionKind matchVectorReduction(const ExtractElementInst *ReduxRoot, + unsigned &Opcode, VectorType *&Ty, + bool &IsPairwise); + /// Additional information about an operand's possible values. enum OperandValueKind { OK_AnyValue, // Operand can have any value. @@ -881,6 +930,10 @@ public: /// \return The width of the smallest vector register type. unsigned getMinVectorRegisterBitWidth() const; + /// \return The maximum value of vscale if the target specifies an + /// architectural maximum vector length, and None otherwise. + Optional getMaxVScale() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -894,6 +947,11 @@ public: /// applies when shouldMaximizeVectorBandwidth returns true. unsigned getMinimumVF(unsigned ElemWidth) const; + /// \return The maximum vectorization factor for types of given element + /// bit width and opcode, or 0 if there is no maximum VF. + /// Currently only used by the SLP vectorizer. + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. 
@@ -996,10 +1054,47 @@ public: int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0, VectorType *SubTp = nullptr) const; + /// Represents a hint about the context in which a cast is used. + /// + /// For zext/sext, the context of the cast is the operand, which must be a + /// load of some kind. For trunc, the context is of the cast is the single + /// user of the instruction, which must be a store of some kind. + /// + /// This enum allows the vectorizer to give getCastInstrCost an idea of the + /// type of cast it's dealing with, as not every cast is equal. For instance, + /// the zext of a load may be free, but the zext of an interleaving load can + //// be (very) expensive! + /// + /// See \c getCastContextHint to compute a CastContextHint from a cast + /// Instruction*. Callers can use it if they don't need to override the + /// context and just want it to be calculated from the instruction. + /// + /// FIXME: This handles the types of load/store that the vectorizer can + /// produce, which are the cases where the context instruction is most + /// likely to be incorrect. There are other situations where that can happen + /// too, which might be handled here but in the long run a more general + /// solution of costing multiple instructions at the same times may be better. + enum class CastContextHint : uint8_t { + None, ///< The cast is not used with a load/store of any kind. + Normal, ///< The cast is used with a normal load/store. + Masked, ///< The cast is used with a masked load/store. + GatherScatter, ///< The cast is used with a gather/scatter. + Interleave, ///< The cast is used with an interleaved load/store. + Reversed, ///< The cast is used with a reversed load/store. + }; + + /// Calculates a CastContextHint from \p I. + /// This should be used by callers of getCastInstrCost if they wish to + /// determine the context from some instruction. 
+ /// \returns the CastContextHint for ZExt/SExt/Trunc, None if \p I is nullptr, + /// or if it's another type of cast. + static CastContextHint getCastContextHint(const Instruction *I); + /// \return The expected cost of cast instructions, such as bitcast, trunc, /// zext, etc. If there is an existing instruction that holds Opcode, it /// may be passed in the 'I' parameter. int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, const Instruction *I = nullptr) const; @@ -1015,10 +1110,14 @@ public: /// \returns The expected cost of compare and select instructions. If there /// is an existing instruction that holds Opcode, it may be passed in the - /// 'I' parameter. - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, - const Instruction *I = nullptr) const; + /// 'I' parameter. The \p VecPred parameter can be used to indicate the select + /// is using a compare with the specified predicate as condition. When vector + /// types are passed, \p VecPred must be used for all lanes. + int getCmpSelInstrCost( + unsigned Opcode, Type *ValTy, Type *CondTy = nullptr, + CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, + const Instruction *I = nullptr) const; /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. @@ -1086,6 +1185,16 @@ public: VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; + /// Calculate the cost of an extended reduction pattern, similar to + /// getArithmeticReductionCost of an Add reduction with an extension and + /// optional multiply. 
This is the cost of as: + /// ResTy vecreduce.add(ext(Ty A)), or if IsMLA flag is set then: + /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)). The reduction happens + /// on a VectorType with ResTy elements and Ty lanes. + InstructionCost getExtendedAddReductionCost( + bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; + /// \returns The cost of Intrinsic instructions. Analyses the real arguments. /// Three cases are handled: 1. scalar instruction 2. vector instruction /// 3. scalar instruction which is to be vectorized. @@ -1221,6 +1330,24 @@ public: bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; + /// \returns True if the target prefers reductions in loop. + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + + /// \returns True if the target prefers reductions select kept in the loop + /// when tail folding. i.e. + /// loop: + /// p = phi (0, s) + /// a = add (p, x) + /// s = select (mask, a, p) + /// vecreduce.add(s) + /// + /// As opposed to the normal scheme of p = phi (0, a) which allows the select + /// to be pulled out of the loop. If the select(.., add, ..) can be predicated + /// by the target, this can lead to cleaner code generation. + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; @@ -1229,6 +1356,9 @@ public: /// to a stack reload. unsigned getGISelRematGlobalCost() const; + /// \returns True if the target supports scalable vectors. 
+ bool supportsScalableVectors() const; + /// \name Vector Predication Information /// @{ /// Whether the target supports the %evl parameter of VP intrinsic efficiently @@ -1268,6 +1398,7 @@ public: ArrayRef Operands, TTI::TargetCostKind CostKind) = 0; virtual unsigned getInliningThresholdMultiplier() = 0; + virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0; virtual int getInlinerVectorBonusPercent() = 0; virtual int getMemcpyCost(const Instruction *I) = 0; virtual unsigned @@ -1284,6 +1415,7 @@ public: virtual bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const = 0; virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0; + virtual unsigned getAssumedAddrSpace(const Value *V) const = 0; virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const = 0; @@ -1301,6 +1433,17 @@ public: AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; virtual bool emitGetActiveLaneMask() = 0; + virtual Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) = 0; + virtual Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) = 0; + virtual Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1309,6 +1452,7 @@ public: Instruction *I) = 0; virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) = 0; + virtual bool isNumRegsMajorCostOfLSR() = 0; virtual bool isProfitableLSRChainElement(Instruction *I) = 0; virtual bool canMacroFuseCmp() = 0; virtual bool canSaveCmp(Loop *L, 
BranchInst **BI, ScalarEvolution *SE, @@ -1335,6 +1479,7 @@ public: virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; + virtual unsigned getRegUsageForType(Type *Ty) = 0; virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; virtual bool useColdCCForColdCall(Function &F) = 0; @@ -1365,7 +1510,8 @@ public: virtual int getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) = 0; virtual int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, - Type *Ty, TargetCostKind CostKind) = 0; + Type *Ty, TargetCostKind CostKind, + Instruction *Inst = nullptr) = 0; virtual int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TargetCostKind CostKind) = 0; @@ -1375,8 +1521,10 @@ public: virtual const char *getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; + virtual Optional getMaxVScale() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; + virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() const = 0; @@ -1418,6 +1566,7 @@ public: virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) = 0; virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -1425,6 +1574,7 @@ public: virtual int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0; virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + 
CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) = 0; virtual int getVectorInstrCost(unsigned Opcode, Type *Val, @@ -1452,6 +1602,9 @@ public: virtual int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost getExtendedAddReductionCost( + bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0; virtual int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) = 0; virtual int getCallInstrCost(Function *F, Type *RetTy, @@ -1499,8 +1652,13 @@ public: VectorType *VecTy) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; + virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; + virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; + virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength() const = 0; virtual int getInstructionLatency(const Instruction *I) = 0; }; @@ -1525,6 +1683,9 @@ public: unsigned getInliningThresholdMultiplier() override { return Impl.getInliningThresholdMultiplier(); } + unsigned adjustInliningThreshold(const CallBase *CB) override { + return Impl.adjustInliningThreshold(CB); + } int getInlinerVectorBonusPercent() override { return Impl.getInlinerVectorBonusPercent(); } @@ -1558,6 +1719,10 @@ public: return Impl.isNoopAddrSpaceCast(FromAS, ToAS); } + unsigned getAssumedAddrSpace(const Value *V) const override { + return Impl.getAssumedAddrSpace(V); + } + Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override { return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV); @@ -1588,6 +1753,26 
@@ public: bool emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) override { + return Impl.instCombineIntrinsic(IC, II); + } + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) override { + return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) override { + return Impl.simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } @@ -1604,6 +1789,9 @@ public: TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); } + bool isNumRegsMajorCostOfLSR() override { + return Impl.isNumRegsMajorCostOfLSR(); + } bool isProfitableLSRChainElement(Instruction *I) override { return Impl.isProfitableLSRChainElement(I); } @@ -1665,6 +1853,9 @@ public: } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } + unsigned getRegUsageForType(Type *Ty) override { + return Impl.getRegUsageForType(Ty); + } bool shouldBuildLookupTables() override { return Impl.shouldBuildLookupTables(); } @@ -1729,9 +1920,10 @@ public: TargetCostKind CostKind) override { return Impl.getIntImmCost(Imm, Ty, CostKind); } - int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, - Type *Ty, TargetCostKind CostKind) override { - return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty, CostKind); + int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty, + TargetCostKind CostKind, + Instruction *Inst = nullptr) override { + 
return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty, CostKind, Inst); } int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TargetCostKind CostKind) override { @@ -1753,12 +1945,18 @@ public: unsigned getMinVectorRegisterBitWidth() override { return Impl.getMinVectorRegisterBitWidth(); } + Optional getMaxVScale() const override { + return Impl.getMaxVScale(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } unsigned getMinimumVF(unsigned ElemWidth) const override { return Impl.getMinimumVF(ElemWidth); } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override { + return Impl.getMaximumVF(ElemWidth, Opcode); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( @@ -1826,9 +2024,9 @@ public: return Impl.getShuffleCost(Kind, Tp, Index, SubTp); } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) override { - return Impl.getCastInstrCost(Opcode, Dst, Src, CostKind, I); + return Impl.getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); } int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) override { @@ -1838,9 +2036,10 @@ public: return Impl.getCFInstrCost(Opcode, CostKind); } int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) override { - return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); + return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override { return Impl.getVectorInstrCost(Opcode, Val, Index); @@ -1886,6 +2085,12 @@ public: return Impl.getMinMaxReductionCost(Ty, 
CondTy, IsPairwiseForm, IsUnsigned, CostKind); } + InstructionCost getExtendedAddReductionCost( + bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override { + return Impl.getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty, + CostKind); + } int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) override { return Impl.getIntrinsicInstrCost(ICA, CostKind); @@ -1979,6 +2184,14 @@ public: ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.preferInLoopReduction(Opcode, Ty, Flags); + } + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags); + } bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } @@ -1987,6 +2200,10 @@ public: return Impl.getGISelRematGlobalCost(); } + bool supportsScalableVectors() const override { + return Impl.supportsScalableVectors(); + } + bool hasActiveVectorLength() const override { return Impl.hasActiveVectorLength(); } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 0ce975d6d4b5..7e31cb365a87 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -20,7 +20,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" @@ -46,7 +46,7 @@ public: int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef 
Operands, - TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) { + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const { // In the basic model, we just assume that all-constant GEPs will be folded // into their uses via addressing modes. for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx) @@ -59,28 +59,31 @@ public: unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { + BlockFrequencyInfo *BFI) const { (void)PSI; (void)BFI; JTSize = 0; return SI.getNumCases(); } - unsigned getInliningThresholdMultiplier() { return 1; } + unsigned getInliningThresholdMultiplier() const { return 1; } + unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; } - int getInlinerVectorBonusPercent() { return 150; } + int getInlinerVectorBonusPercent() const { return 150; } - unsigned getMemcpyCost(const Instruction *I) { return TTI::TCC_Expensive; } + unsigned getMemcpyCost(const Instruction *I) const { + return TTI::TCC_Expensive; + } - bool hasBranchDivergence() { return false; } + bool hasBranchDivergence() const { return false; } - bool useGPUDivergenceAnalysis() { return false; } + bool useGPUDivergenceAnalysis() const { return false; } - bool isSourceOfDivergence(const Value *V) { return false; } + bool isSourceOfDivergence(const Value *V) const { return false; } - bool isAlwaysUniform(const Value *V) { return false; } + bool isAlwaysUniform(const Value *V) const { return false; } - unsigned getFlatAddressSpace() { return -1; } + unsigned getFlatAddressSpace() const { return -1; } bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const { @@ -89,12 +92,14 @@ public: bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; } + unsigned getAssumedAddrSpace(const Value *V) const { return -1; } + Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const { return nullptr; } - bool 
isLoweredToCall(const Function *F) { + bool isLoweredToCall(const Function *F) const { assert(F && "A concrete function must be provided to this routine."); // FIXME: These should almost certainly not be handled here, and instead @@ -132,7 +137,7 @@ public: bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, - HardwareLoopInfo &HWLoopInfo) { + HardwareLoopInfo &HWLoopInfo) const { return false; } @@ -147,38 +152,60 @@ public: return false; } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const { + return None; + } + + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + return None; + } + + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + return None; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, - TTI::UnrollingPreferences &) {} + TTI::UnrollingPreferences &) const {} void getPeelingPreferences(Loop *, ScalarEvolution &, - TTI::PeelingPreferences &) {} + TTI::PeelingPreferences &) const {} - bool isLegalAddImmediate(int64_t Imm) { return false; } + bool isLegalAddImmediate(int64_t Imm) const { return false; } - bool isLegalICmpImmediate(int64_t Imm) { return false; } + bool isLegalICmpImmediate(int64_t Imm) const { return false; } bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, - Instruction *I = nullptr) { + Instruction *I = nullptr) const { // Guess that only reg and reg+reg addressing is allowed. This heuristic is // taken from the implementation of LSR. 
return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1); } - bool isLSRCostLess(TTI::LSRCost &C1, TTI::LSRCost &C2) { + bool isLSRCostLess(TTI::LSRCost &C1, TTI::LSRCost &C2) const { return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } - bool isProfitableLSRChainElement(Instruction *I) { return false; } + bool isNumRegsMajorCostOfLSR() const { return true; } + + bool isProfitableLSRChainElement(Instruction *I) const { return false; } - bool canMacroFuseCmp() { return false; } + bool canMacroFuseCmp() const { return false; } bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, - TargetLibraryInfo *LibInfo) { + TargetLibraryInfo *LibInfo) const { return false; } @@ -186,40 +213,51 @@ public: bool shouldFavorBackedgeIndex(const Loop *L) const { return false; } - bool isLegalMaskedStore(Type *DataType, Align Alignment) { return false; } + bool isLegalMaskedStore(Type *DataType, Align Alignment) const { + return false; + } - bool isLegalMaskedLoad(Type *DataType, Align Alignment) { return false; } + bool isLegalMaskedLoad(Type *DataType, Align Alignment) const { + return false; + } - bool isLegalNTStore(Type *DataType, Align Alignment) { + bool isLegalNTStore(Type *DataType, Align Alignment) const { // By default, assume nontemporal memory stores are available for stores // that are aligned and have a size that is a power of 2. unsigned DataSize = DL.getTypeStoreSize(DataType); return Alignment >= DataSize && isPowerOf2_32(DataSize); } - bool isLegalNTLoad(Type *DataType, Align Alignment) { + bool isLegalNTLoad(Type *DataType, Align Alignment) const { // By default, assume nontemporal memory loads are available for loads that // are aligned and have a size that is a power of 2. 
unsigned DataSize = DL.getTypeStoreSize(DataType); return Alignment >= DataSize && isPowerOf2_32(DataSize); } - bool isLegalMaskedScatter(Type *DataType, Align Alignment) { return false; } + bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { + return false; + } - bool isLegalMaskedGather(Type *DataType, Align Alignment) { return false; } + bool isLegalMaskedGather(Type *DataType, Align Alignment) const { + return false; + } - bool isLegalMaskedCompressStore(Type *DataType) { return false; } + bool isLegalMaskedCompressStore(Type *DataType) const { return false; } - bool isLegalMaskedExpandLoad(Type *DataType) { return false; } + bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } - bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; } + bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; } - bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; } + bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const { + return false; + } - bool prefersVectorizedAddressing() { return true; } + bool prefersVectorizedAddressing() const { return true; } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { // Guess that all legal addressing mode are free. 
if (isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace)) @@ -227,80 +265,87 @@ public: return -1; } - bool LSRWithInstrQueries() { return false; } + bool LSRWithInstrQueries() const { return false; } - bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; } + bool isTruncateFree(Type *Ty1, Type *Ty2) const { return false; } - bool isProfitableToHoist(Instruction *I) { return true; } + bool isProfitableToHoist(Instruction *I) const { return true; } - bool useAA() { return false; } + bool useAA() const { return false; } - bool isTypeLegal(Type *Ty) { return false; } + bool isTypeLegal(Type *Ty) const { return false; } - bool shouldBuildLookupTables() { return true; } - bool shouldBuildLookupTablesForConstant(Constant *C) { return true; } + unsigned getRegUsageForType(Type *Ty) const { return 1; } - bool useColdCCForColdCall(Function &F) { return false; } + bool shouldBuildLookupTables() const { return true; } + bool shouldBuildLookupTablesForConstant(Constant *C) const { return true; } + + bool useColdCCForColdCall(Function &F) const { return false; } unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, - bool Insert, bool Extract) { + bool Insert, bool Extract) const { return 0; } unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) { + unsigned VF) const { return 0; } - bool supportsEfficientVectorElementLoadStore() { return false; } + bool supportsEfficientVectorElementLoadStore() const { return false; } - bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } + bool enableAggressiveInterleaving(bool LoopHasReductions) const { + return false; + } TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return {}; } - bool enableInterleavedAccessVectorization() { return false; } + bool enableInterleavedAccessVectorization() const { return false; } - bool enableMaskedInterleavedAccessVectorization() { return false; } + bool 
enableMaskedInterleavedAccessVectorization() const { return false; } - bool isFPVectorizationPotentiallyUnsafe() { return false; } + bool isFPVectorizationPotentiallyUnsafe() const { return false; } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, unsigned Alignment, - bool *Fast) { + bool *Fast) const { return false; } - TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) { + TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const { return TTI::PSK_Software; } - bool haveFastSqrt(Type *Ty) { return false; } + bool haveFastSqrt(Type *Ty) const { return false; } - bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; } + bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; } - unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; } + unsigned getFPOpCost(Type *Ty) const { + return TargetTransformInfo::TCC_Basic; + } int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) { + Type *Ty) const { return 0; } unsigned getIntImmCost(const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind) const { return TTI::TCC_Basic; } unsigned getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty, TTI::TargetCostKind CostKind) { + Type *Ty, TTI::TargetCostKind CostKind, + Instruction *Inst = nullptr) const { return TTI::TCC_Free; } unsigned getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind) const { return TTI::TCC_Free; } @@ -323,15 +368,18 @@ public: unsigned getRegisterBitWidth(bool Vector) const { return 32; } - unsigned getMinVectorRegisterBitWidth() { return 128; } + unsigned getMinVectorRegisterBitWidth() const { return 128; } + + Optional getMaxVScale() const { return None; } bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } unsigned getMinimumVF(unsigned 
ElemWidth) const { return 0; } - bool - shouldConsiderAddressTypePromotion(const Instruction &I, - bool &AllowPromotionWithoutCommonHeader) { + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; } + + bool shouldConsiderAddressTypePromotion( + const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { AllowPromotionWithoutCommonHeader = false; return false; } @@ -370,7 +418,7 @@ public: unsigned getMaxPrefetchIterationsAhead() const { return UINT_MAX; } bool enableWritePrefetching() const { return false; } - unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } + unsigned getMaxInterleaveFactor(unsigned VF) const { return 1; } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -379,7 +427,7 @@ public: TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, - const Instruction *CxtI = nullptr) { + const Instruction *CxtI = nullptr) const { // FIXME: A number of transformation tests seem to require these values // which seems a little odd for how arbitary there are. switch (Opcode) { @@ -398,13 +446,14 @@ public: } unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, int Index, - VectorType *SubTp) { + VectorType *SubTp) const { return 1; } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, - const Instruction *I) { + const Instruction *I) const { switch (Opcode) { default: break; @@ -427,23 +476,24 @@ public: // Identity and pointer-to-pointer casts are free. return 0; break; - case Instruction::Trunc: + case Instruction::Trunc: { // trunc to a native type is free (assuming the target has compare and // shift-right of the same width). 
- if (DL.isLegalInteger(DL.getTypeSizeInBits(Dst))) + TypeSize DstSize = DL.getTypeSizeInBits(Dst); + if (!DstSize.isScalable() && DL.isLegalInteger(DstSize.getFixedSize())) return 0; break; } + } return 1; } unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, - VectorType *VecTy, unsigned Index) { + VectorType *VecTy, unsigned Index) const { return 1; } - unsigned getCFInstrCost(unsigned Opcode, - TTI::TargetCostKind CostKind) { + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) const { // A phi would be free, unless we're costing the throughput because it // will require a register. if (Opcode == Instruction::PHI && CostKind != TTI::TCK_RecipThroughput) @@ -452,12 +502,14 @@ public: } unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const { return 1; } - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { return 1; } @@ -469,32 +521,33 @@ public: unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind) const { return 1; } unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) { + const Instruction *I = nullptr) const { return 1; } unsigned getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, - bool UseMaskForCond, bool UseMaskForGaps) { + bool UseMaskForCond, bool UseMaskForGaps) const { return 1; } unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind) const { switch (ICA.getID()) { default: break; case 
Intrinsic::annotation: case Intrinsic::assume: case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::dbg_label: @@ -505,6 +558,7 @@ public: case Intrinsic::is_constant: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::experimental_noalias_scope_decl: case Intrinsic::objectsize: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: @@ -526,26 +580,38 @@ public: } unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind) const { return 1; } - unsigned getNumberOfParts(Type *Tp) { return 0; } + unsigned getNumberOfParts(Type *Tp) const { return 0; } unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *, - const SCEV *) { + const SCEV *) const { return 0; } unsigned getArithmeticReductionCost(unsigned, VectorType *, bool, - TTI::TargetCostKind) { return 1; } + TTI::TargetCostKind) const { + return 1; + } unsigned getMinMaxReductionCost(VectorType *, VectorType *, bool, bool, - TTI::TargetCostKind) { return 1; } + TTI::TargetCostKind) const { + return 1; + } + + InstructionCost getExtendedAddReductionCost( + bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const { + return 1; + } - unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) { return 0; } + unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) const { + return 0; + } - bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) { + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { return false; } @@ -559,7 +625,7 @@ public: } Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) { + Type *ExpectedType) const { return nullptr; } @@ -637,22 +703,34 @@ public: return false; } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + 
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } unsigned getGISelRematGlobalCost() const { return 1; } + bool supportsScalableVectors() const { return false; } + bool hasActiveVectorLength() const { return false; } protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. - unsigned minRequiredElementSize(const Value *Val, bool &isSigned) { + unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const { if (isa(Val) || isa(Val)) { const auto *VectorValue = cast(Val); // In case of a vector need to pick the max between the min // required size for each element - auto *VT = cast(Val->getType()); + auto *VT = cast(Val->getType()); // Assume unsigned elements isSigned = false; @@ -700,12 +778,12 @@ protected: return Val->getType()->getScalarSizeInBits(); } - bool isStridedAccess(const SCEV *Ptr) { + bool isStridedAccess(const SCEV *Ptr) const { return Ptr && isa(Ptr); } const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE, - const SCEV *Ptr) { + const SCEV *Ptr) const { if (!isStridedAccess(Ptr)) return nullptr; const SCEVAddRecExpr *AddRec = cast(Ptr); @@ -713,7 +791,7 @@ protected: } bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, - int64_t MergeDistance) { + int64_t MergeDistance) const { const SCEVConstant *Step = getConstantStrideStep(SE, Ptr); if (!Step) return false; @@ -775,7 +853,12 @@ public: uint64_t Field = ConstIdx->getZExtValue(); BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); } else { - int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); + // If this operand is a scalable type, bail out early. 
+ // TODO: handle scalable vectors + if (isa(TargetType)) + return TTI::TCC_Basic; + int64_t ElementSize = + DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize(); if (ConstIdx) { BaseOffset += ConstIdx->getValue().sextOrTrunc(PtrSizeBits) * ElementSize; @@ -800,30 +883,17 @@ public: int getUserCost(const User *U, ArrayRef Operands, TTI::TargetCostKind CostKind) { auto *TargetTTI = static_cast(this); - - // FIXME: We shouldn't have to special-case intrinsics here. - if (CostKind == TTI::TCK_RecipThroughput) { - if (const IntrinsicInst *II = dyn_cast(U)) { - IntrinsicCostAttributes CostAttrs(*II); - return TargetTTI->getIntrinsicInstrCost(CostAttrs, CostKind); - } - } - + // Handle non-intrinsic calls, invokes, and callbr. // FIXME: Unlikely to be true for anything but CodeSize. - if (const auto *CB = dyn_cast(U)) { - const Function *F = CB->getCalledFunction(); - if (F) { - FunctionType *FTy = F->getFunctionType(); - if (Intrinsic::ID IID = F->getIntrinsicID()) { - IntrinsicCostAttributes Attrs(IID, *CB); - return TargetTTI->getIntrinsicInstrCost(Attrs, CostKind); - } - + auto *CB = dyn_cast(U); + if (CB && !isa(U)) { + if (const Function *F = CB->getCalledFunction()) { if (!TargetTTI->isLoweredToCall(F)) return TTI::TCC_Basic; // Give a basic cost if it will be lowered - return TTI::TCC_Basic * (FTy->getNumParams() + 1); + return TTI::TCC_Basic * (F->getFunctionType()->getNumParams() + 1); } + // For indirect or other calls, scale cost by number of arguments. 
return TTI::TCC_Basic * (CB->arg_size() + 1); } @@ -835,6 +905,12 @@ public: switch (Opcode) { default: break; + case Instruction::Call: { + assert(isa(U) && "Unexpected non-intrinsic call"); + auto *Intrinsic = cast(U); + IntrinsicCostAttributes CostAttrs(Intrinsic->getIntrinsicID(), *CB); + return TargetTTI->getIntrinsicInstrCost(CostAttrs, CostKind); + } case Instruction::Br: case Instruction::Ret: case Instruction::PHI: @@ -895,7 +971,8 @@ public: case Instruction::SExt: case Instruction::ZExt: case Instruction::AddrSpaceCast: - return TargetTTI->getCastInstrCost(Opcode, Ty, OpTy, CostKind, I); + return TargetTTI->getCastInstrCost( + Opcode, Ty, OpTy, TTI::getCastContextHint(I), CostKind, I); case Instruction::Store: { auto *SI = cast(U); Type *ValTy = U->getOperand(0)->getType(); @@ -912,12 +989,16 @@ public: case Instruction::Select: { Type *CondTy = U->getOperand(0)->getType(); return TargetTTI->getCmpSelInstrCost(Opcode, U->getType(), CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = U->getOperand(0)->getType(); + // TODO: Also handle ICmp/FCmp constant expressions. return TargetTTI->getCmpSelInstrCost(Opcode, ValTy, U->getType(), + I ? cast(I)->getPredicate() + : CmpInst::BAD_ICMP_PREDICATE, CostKind, I); } case Instruction::InsertElement: { @@ -969,41 +1050,23 @@ public: if (CI) Idx = CI->getZExtValue(); - // Try to match a reduction sequence (series of shufflevector and - // vector adds followed by a extractelement). 
- unsigned ReduxOpCode; - VectorType *ReduxType; - - switch (TTI::matchVectorSplittingReduction(EEI, ReduxOpCode, - ReduxType)) { - case TTI::RK_Arithmetic: - return TargetTTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, - /*IsPairwiseForm=*/false, - CostKind); - case TTI::RK_MinMax: - return TargetTTI->getMinMaxReductionCost( - ReduxType, cast(CmpInst::makeCmpResultType(ReduxType)), - /*IsPairwiseForm=*/false, /*IsUnsigned=*/false, CostKind); - case TTI::RK_UnsignedMinMax: - return TargetTTI->getMinMaxReductionCost( - ReduxType, cast(CmpInst::makeCmpResultType(ReduxType)), - /*IsPairwiseForm=*/false, /*IsUnsigned=*/true, CostKind); - case TTI::RK_None: - break; - } - - switch (TTI::matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) { + // Try to match a reduction (a series of shufflevector and vector ops + // followed by an extractelement). + unsigned RdxOpcode; + VectorType *RdxType; + bool IsPairwise; + switch (TTI::matchVectorReduction(EEI, RdxOpcode, RdxType, IsPairwise)) { case TTI::RK_Arithmetic: - return TargetTTI->getArithmeticReductionCost(ReduxOpCode, ReduxType, - /*IsPairwiseForm=*/true, CostKind); + return TargetTTI->getArithmeticReductionCost(RdxOpcode, RdxType, + IsPairwise, CostKind); case TTI::RK_MinMax: return TargetTTI->getMinMaxReductionCost( - ReduxType, cast(CmpInst::makeCmpResultType(ReduxType)), - /*IsPairwiseForm=*/true, /*IsUnsigned=*/false, CostKind); + RdxType, cast(CmpInst::makeCmpResultType(RdxType)), + IsPairwise, /*IsUnsigned=*/false, CostKind); case TTI::RK_UnsignedMinMax: return TargetTTI->getMinMaxReductionCost( - ReduxType, cast(CmpInst::makeCmpResultType(ReduxType)), - /*IsPairwiseForm=*/true, /*IsUnsigned=*/true, CostKind); + RdxType, cast(CmpInst::makeCmpResultType(RdxType)), + IsPairwise, /*IsUnsigned=*/true, CostKind); case TTI::RK_None: break; } @@ -1016,8 +1079,7 @@ public: } int getInstructionLatency(const Instruction *I) { - SmallVector Operands(I->value_op_begin(), - I->value_op_end()); + SmallVector 
Operands(I->operand_values()); if (getUserCost(I, Operands, TTI::TCK_Latency) == TTI::TCC_Free) return 0; diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h new file mode 100644 index 000000000000..d02bcd0e335b --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h @@ -0,0 +1,112 @@ +//===-- ImportedFunctionsInliningStatistics.h -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Generating inliner statistics for imported functions, mostly useful for +// ThinLTO. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_IMPORTEDFUNCTIONSINLININGSTATISTICS_H +#define LLVM_TRANSFORMS_UTILS_IMPORTEDFUNCTIONSINLININGSTATISTICS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include +#include + +namespace llvm { +class Module; +class Function; +/// Calculate and dump ThinLTO specific inliner stats. +/// The main statistics are: +/// (1) Number of inlined imported functions, +/// (2) Number of imported functions inlined into importing module (indirect), +/// (3) Number of non imported functions inlined into importing module +/// (indirect). +/// The difference between first and the second is that first stat counts +/// all performed inlines on imported functions, but the second one only the +/// functions that have been eventually inlined to a function in the importing +/// module (by a chain of inlines). Because llvm uses bottom-up inliner, it is +/// possible to e.g. 
import function `A`, `B` and then inline `B` to `A`, +/// and after this `A` might be too big to be inlined into some other function +/// that calls it. It calculates this statistic by building graph, where +/// the nodes are functions, and edges are performed inlines and then by marking +/// the edges starting from not imported function. +/// +/// If `Verbose` is set to true, then it also dumps statistics +/// per each inlined function, sorted by the greatest inlines count like +/// - number of performed inlines +/// - number of performed inlines to importing module +class ImportedFunctionsInliningStatistics { +private: + /// InlineGraphNode represents node in graph of inlined functions. + struct InlineGraphNode { + // Default-constructible and movable. + InlineGraphNode() = default; + InlineGraphNode(InlineGraphNode &&) = default; + InlineGraphNode &operator=(InlineGraphNode &&) = default; + + llvm::SmallVector InlinedCallees; + /// Incremented every direct inline. + int32_t NumberOfInlines = 0; + /// Number of inlines into non imported function (possibly indirect via + /// intermediate inlines). Computed based on graph search. + int32_t NumberOfRealInlines = 0; + bool Imported = false; + bool Visited = false; + }; + +public: + ImportedFunctionsInliningStatistics() = default; + ImportedFunctionsInliningStatistics( + const ImportedFunctionsInliningStatistics &) = delete; + + /// Set information like AllFunctions, ImportedFunctions, ModuleName. + void setModuleInfo(const Module &M); + /// Record inline of @param Callee to @param Caller for statistis. + void recordInline(const Function &Caller, const Function &Callee); + /// Dump stats computed with InlinerStatistics class. + /// If @param Verbose is true then separate statistics for every inlined + /// function will be printed. + void dump(bool Verbose); + +private: + /// Creates new Node in NodeMap and sets attributes, or returns existed one. 
+ InlineGraphNode &createInlineGraphNode(const Function &); + void calculateRealInlines(); + void dfs(InlineGraphNode &GraphNode); + + using NodesMapTy = + llvm::StringMap>; + using SortedNodesTy = + std::vector; + /// Returns vector of elements sorted by + /// (-NumberOfInlines, -NumberOfRealInlines, FunctionName). + SortedNodesTy getSortedNodes(); + +private: + /// This map manage life of all InlineGraphNodes. Unique pointer to + /// InlineGraphNode used since the node pointers are also saved in the + /// InlinedCallees vector. If it would store InlineGraphNode instead then the + /// address of the node would not be invariant. + NodesMapTy NodesMap; + /// Non external functions that have some other function inlined inside. + std::vector NonImportedCallers; + int AllFunctions = 0; + int ImportedFunctions = 0; + StringRef ModuleName; +}; + +enum class InlinerFunctionImportStatsOpts { + No = 0, + Basic = 1, + Verbose = 2, +}; + +} // llvm + +#endif // LLVM_TRANSFORMS_UTILS_IMPORTEDFUNCTIONSINLININGSTATISTICS_H diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/Local.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/Local.h index f31b56345424..bd82b34165d6 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/Local.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/Local.h @@ -30,7 +30,7 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, bool NoAssumptions = false) { GEPOperator *GEPOp = cast(GEP); Type *IntIdxTy = DL.getIndexType(GEP->getType()); - Value *Result = Constant::getNullValue(IntIdxTy); + Value *Result = nullptr; // If the GEP is inbounds, we know that none of the addressing operations will // overflow in a signed sense. 
@@ -46,6 +46,7 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, ++i, ++GTI) { Value *Op = *i; uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask; + Value *Offset; if (Constant *OpC = dyn_cast(Op)) { if (OpC->isZeroValue()) continue; @@ -54,46 +55,47 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, if (StructType *STy = GTI.getStructTypeOrNull()) { uint64_t OpValue = OpC->getUniqueInteger().getZExtValue(); Size = DL.getStructLayout(STy)->getElementOffset(OpValue); - - if (Size) - Result = Builder->CreateAdd(Result, ConstantInt::get(IntIdxTy, Size), - GEP->getName().str()+".offs"); - continue; + if (!Size) + continue; + + Offset = ConstantInt::get(IntIdxTy, Size); + } else { + // Splat the constant if needed. + if (IntIdxTy->isVectorTy() && !OpC->getType()->isVectorTy()) + OpC = ConstantVector::getSplat( + cast(IntIdxTy)->getElementCount(), OpC); + + Constant *Scale = ConstantInt::get(IntIdxTy, Size); + Constant *OC = + ConstantExpr::getIntegerCast(OpC, IntIdxTy, true /*SExt*/); + Offset = + ConstantExpr::getMul(OC, Scale, false /*NUW*/, isInBounds /*NSW*/); } - - // Splat the constant if needed. - if (IntIdxTy->isVectorTy() && !OpC->getType()->isVectorTy()) - OpC = ConstantVector::getSplat( - cast(IntIdxTy)->getElementCount(), OpC); - - Constant *Scale = ConstantInt::get(IntIdxTy, Size); - Constant *OC = ConstantExpr::getIntegerCast(OpC, IntIdxTy, true /*SExt*/); - Scale = - ConstantExpr::getMul(OC, Scale, false /*NUW*/, isInBounds /*NSW*/); - // Emit an add instruction. - Result = Builder->CreateAdd(Result, Scale, GEP->getName().str()+".offs"); - continue; - } - - // Splat the index if needed. - if (IntIdxTy->isVectorTy() && !Op->getType()->isVectorTy()) - Op = Builder->CreateVectorSplat( - cast(IntIdxTy)->getNumElements(), Op); - - // Convert to correct type. 
- if (Op->getType() != IntIdxTy) - Op = Builder->CreateIntCast(Op, IntIdxTy, true, Op->getName().str()+".c"); - if (Size != 1) { - // We'll let instcombine(mul) convert this to a shl if possible. - Op = Builder->CreateMul(Op, ConstantInt::get(IntIdxTy, Size), - GEP->getName().str() + ".idx", false /*NUW*/, - isInBounds /*NSW*/); + } else { + // Splat the index if needed. + if (IntIdxTy->isVectorTy() && !Op->getType()->isVectorTy()) + Op = Builder->CreateVectorSplat( + cast(IntIdxTy)->getNumElements(), Op); + + // Convert to correct type. + if (Op->getType() != IntIdxTy) + Op = Builder->CreateIntCast(Op, IntIdxTy, true, Op->getName().str()+".c"); + if (Size != 1) { + // We'll let instcombine(mul) convert this to a shl if possible. + Op = Builder->CreateMul(Op, ConstantInt::get(IntIdxTy, Size), + GEP->getName().str() + ".idx", false /*NUW*/, + isInBounds /*NSW*/); + } + Offset = Op; } - // Emit an add instruction. - Result = Builder->CreateAdd(Op, Result, GEP->getName().str()+".offs"); + if (Result) + Result = Builder->CreateAdd(Result, Offset, GEP->getName().str()+".offs", + false /*NUW*/, isInBounds /*NSW*/); + else + Result = Offset; } - return Result; + return Result ? 
Result : Constant::getNullValue(IntIdxTy); } } diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/TFUtils.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/TFUtils.h index 2ab2c7a57d94..ea6bc2cf19ee 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/TFUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Utils/TFUtils.h @@ -9,10 +9,11 @@ #ifndef LLVM_ANALYSIS_UTILS_TFUTILS_H #define LLVM_ANALYSIS_UTILS_TFUTILS_H -#include "llvm/Config/config.h" +#include "llvm/Config/llvm-config.h" #ifdef LLVM_HAVE_TF_API #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/JSON.h" #include #include @@ -36,6 +37,141 @@ namespace llvm { class TFModelEvaluatorImpl; class EvaluationResultImpl; +/// TensorSpec encapsulates the specification of a tensor: its dimensions, or +/// "shape" (row-major), its type (see TensorSpec::getDataType specializations +/// for supported types), its name and port (see "TensorFlow: Large-Scale +/// Machine Learning on Heterogeneous Distributed Systems", section 4.2, para 2: +/// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) +/// +/// TensorSpec is used to set up a TFModelEvaluator by describing the expected +/// inputs and outputs. +class TensorSpec final { +public: + template + static TensorSpec createSpec(const std::string &Name, + const std::vector &Shape, + int Port = 0) { + return TensorSpec(Name, Port, getDataType(), Shape); + } + + const std::string &name() const { return Name; } + int port() const { return Port; } + int typeIndex() const { return TypeIndex; } + const std::vector &shape() const { return Shape; } + + bool operator==(const TensorSpec &Other) const { + return Name == Other.Name && Port == Other.Port && + TypeIndex == Other.TypeIndex && Shape == Other.Shape; + } + + bool operator!=(const TensorSpec &Other) const { return !(*this == Other); } + + /// Get the number of elements in a tensor with this shape. 
+ size_t getElementCount() const { return ElementCount; } + /// Get the size, in bytes, of one element. + size_t getElementByteSize() const; + + template bool isElementType() const { + return getDataType() == TypeIndex; + } + +private: + TensorSpec(const std::string &Name, int Port, int TypeIndex, + const std::vector &Shape); + + template static int getDataType() { + llvm_unreachable("Undefined tensor type"); + } + + std::string Name; + int Port = 0; + int TypeIndex = 0; + std::vector Shape; + size_t ElementCount = 0; +}; + +/// Construct a TensorSpec from a JSON dictionary of the form: +/// { "name": , +/// "port": , +/// "type": , +/// "shape": } +/// For the "type" field, see the C++ primitive types used in +/// TFUTILS_SUPPORTED_TYPES. +Optional getTensorSpecFromJSON(LLVMContext &Ctx, + const json::Value &Value); + +struct LoggedFeatureSpec { + TensorSpec Spec; + Optional LoggingName; +}; + +/// Load the output specs. If SpecFileOverride is not empty, that path is used. +/// Otherwise, the file is assumed to be called 'output_spec.json' and be found +/// under ModelPath (the model directory). +/// The first output tensor name must match ExpectedDecisionName. +/// In case of error, the return is None and the error is logged. +Optional> +loadOutputSpecs(LLVMContext &Ctx, StringRef ExpectedDecisionName, + StringRef ModelPath, StringRef SpecFileOverride = StringRef()); + +/// Logging utility - given an ordered specification of features, and assuming +/// a scalar reward, allow logging feature values and rewards, and then print +/// as tf.train.SequenceExample text protobuf. +/// The assumption is that, for an event to be logged (i.e. a set of feature +/// values and a reward), the user calls the log* API for each feature exactly +/// once, providing the index matching the position in the feature spec list +/// provided at construction: +/// event 0: +/// logTensorValue(0, ...) +/// logTensorValue(1, ...) +/// ... +/// logReward(...) 
+/// event 1: +/// logTensorValue(0, ...) +/// logTensorValue(1, ...) +/// ... +/// logReward(...) +/// +/// At the end, call print to generate the protobuf. +class Logger final { +public: + /// Construct a Logger. If IncludeReward is false, then logReward shouldn't + /// be called, and the reward feature won't be printed out. + Logger(const std::vector &FeatureSpecs, + const TensorSpec &RewardSpec, bool IncludeReward) + : FeatureSpecs(FeatureSpecs), RewardSpec(RewardSpec), + RawLogData(FeatureSpecs.size() + IncludeReward), + IncludeReward(IncludeReward) {} + + template void logReward(T Value) { + assert(IncludeReward); + logTensorValue(RawLogData.size() - 1, &Value); + } + + template void logFinalReward(T Value) { + assert(RawLogData.back().empty()); + logReward(Value); + } + + template + void logTensorValue(size_t FeatureID, const T *Value, size_t Size = 1) { + const char *Start = reinterpret_cast(Value); + const char *End = Start + sizeof(T) * Size; + RawLogData[FeatureID].insert(RawLogData[FeatureID].end(), Start, End); + } + + void print(raw_ostream &OS); + +private: + std::vector FeatureSpecs; + TensorSpec RewardSpec; + /// RawData has one entry per feature, plus one more for the reward. + /// Each feature's values are then stored in a vector, in succession. + /// This means the ith event is stored at [*][i] + std::vector> RawLogData; + const bool IncludeReward; +}; + class TFModelEvaluator final { public: /// The result of a model evaluation. Handles the lifetime of the output @@ -44,25 +180,41 @@ public: class EvaluationResult { public: EvaluationResult(const EvaluationResult &) = delete; + EvaluationResult &operator=(const EvaluationResult &Other) = delete; + EvaluationResult(EvaluationResult &&Other); + EvaluationResult &operator=(EvaluationResult &&Other); + ~EvaluationResult(); - /// Get a pointer to the first element of the tensor at Index. + /// Get a (const) pointer to the first element of the tensor at Index. 
template T *getTensorValue(size_t Index) { return static_cast(getUntypedTensorValue(Index)); } + template const T *getTensorValue(size_t Index) const { + return static_cast(getUntypedTensorValue(Index)); + } + + /// Get a (const) pointer to the untyped data of the tensor. + void *getUntypedTensorValue(size_t Index); + const void *getUntypedTensorValue(size_t Index) const; + private: friend class TFModelEvaluator; EvaluationResult(std::unique_ptr Impl); - void *getUntypedTensorValue(size_t Index); std::unique_ptr Impl; }; TFModelEvaluator(StringRef SavedModelPath, - const std::vector &InputNames, - const std::vector &OutputNames, + const std::vector &InputSpecs, + const std::vector &OutputSpecs, const char *Tags = "serve"); + TFModelEvaluator(StringRef SavedModelPath, + const std::vector &InputSpecs, + function_ref GetOutputSpecs, + size_t OutputSpecsSize, const char *Tags = "serve"); + ~TFModelEvaluator(); TFModelEvaluator(const TFModelEvaluator &) = delete; TFModelEvaluator(TFModelEvaluator &&) = delete; @@ -82,33 +234,32 @@ public: /// otherwise. bool isValid() const { return !!Impl; } - /// Initialize the input at Index as a tensor of the given type and - /// dimensions. 
- template - void initInput(size_t Index, const std::vector &Dimensions) { - return initInput(Index, getModelTypeIndex(), Dimensions); - } - private: void *getUntypedInput(size_t Index); - template int getModelTypeIndex(); - void initInput(size_t Index, int TypeIndex, - const std::vector &Dimensions); - std::unique_ptr Impl; }; -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); +/// List of supported types, as a pair: +/// - C++ type +/// - enum name (implementation-specific) +#define TFUTILS_SUPPORTED_TYPES(M) \ + M(float, TF_FLOAT) \ + M(double, TF_DOUBLE) \ + M(int8_t, TF_INT8) \ + M(uint8_t, TF_UINT8) \ + M(int16_t, TF_INT16) \ + M(uint16_t, TF_UINT16) \ + M(int32_t, TF_INT32) \ + M(uint32_t, TF_UINT32) \ + M(int64_t, TF_INT64) \ + M(uint64_t, TF_UINT64) + +#define TFUTILS_GETDATATYPE_DEF(T, E) \ + template <> int TensorSpec::getDataType(); + +TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_DEF) +#undef TFUTILS_GETDATATYPE_DEF } // namespace llvm #endif // LLVM_HAVE_TF_API diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ValueLattice.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ValueLattice.h index bf5bab9ced22..108d08033ac3 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ValueLattice.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ValueLattice.h @@ -11,6 +11,7 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" // 
//===----------------------------------------------------------------------===// // ValueLatticeElement @@ -456,6 +457,16 @@ public: if (isConstant() && Other.isConstant()) return ConstantExpr::getCompare(Pred, getConstant(), Other.getConstant()); + if (ICmpInst::isEquality(Pred)) { + // not(C) != C => true, not(C) == C => false. + if ((isNotConstant() && Other.isConstant() && + getNotConstant() == Other.getConstant()) || + (isConstant() && Other.isNotConstant() && + getConstant() == Other.getNotConstant())) + return Pred == ICmpInst::ICMP_NE + ? ConstantInt::getTrue(Ty) : ConstantInt::getFalse(Ty); + } + // Integer constants are represented as ConstantRanges with single // elements. if (!isConstantRange() || !Other.isConstantRange()) diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/ValueTracking.h b/contrib/llvm-project/llvm/include/llvm/Analysis/ValueTracking.h index 9510739ef5ab..86c0991451c5 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/ValueTracking.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/ValueTracking.h @@ -21,12 +21,14 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Operator.h" #include #include namespace llvm { class AddOperator; +class AllocaInst; class APInt; class AssumptionCache; class DominatorTree; @@ -43,6 +45,8 @@ class StringRef; class TargetLibraryInfo; class Value; +constexpr unsigned MaxAnalysisRecursionDepth = 6; + /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// @@ -366,14 +370,13 @@ class Value; /// that the returned value has pointer type if the specified value does. If /// the MaxLookup value is non-zero, it limits the number of instructions to /// be stripped off. 
- Value *GetUnderlyingObject(Value *V, const DataLayout &DL, - unsigned MaxLookup = 6); - inline const Value *GetUnderlyingObject(const Value *V, const DataLayout &DL, + Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6); + inline const Value *getUnderlyingObject(const Value *V, unsigned MaxLookup = 6) { - return GetUnderlyingObject(const_cast(V), DL, MaxLookup); + return getUnderlyingObject(const_cast(V), MaxLookup); } - /// This method is similar to GetUnderlyingObject except that it can + /// This method is similar to getUnderlyingObject except that it can /// look through phi and select instructions and return multiple objects. /// /// If LoopInfo is passed, loop phis are further analyzed. If a pointer @@ -401,20 +404,30 @@ class Value; /// Since A[i] and A[i-1] are independent pointers, getUnderlyingObjects /// should not assume that Curr and Prev share the same underlying object thus /// it shouldn't look through the phi above. - void GetUnderlyingObjects(const Value *V, + void getUnderlyingObjects(const Value *V, SmallVectorImpl &Objects, - const DataLayout &DL, LoopInfo *LI = nullptr, - unsigned MaxLookup = 6); + LoopInfo *LI = nullptr, unsigned MaxLookup = 6); - /// This is a wrapper around GetUnderlyingObjects and adds support for basic + /// This is a wrapper around getUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. bool getUnderlyingObjectsForCodeGen(const Value *V, - SmallVectorImpl &Objects, - const DataLayout &DL); + SmallVectorImpl &Objects); + + /// Returns unique alloca where the value comes from, or nullptr. + /// If OffsetZero is true check that V points to the begining of the alloca. + AllocaInst *findAllocaForValue(Value *V, bool OffsetZero = false); + inline const AllocaInst *findAllocaForValue(const Value *V, + bool OffsetZero = false) { + return findAllocaForValue(const_cast(V), OffsetZero); + } /// Return true if the only users of this pointer are lifetime markers. 
bool onlyUsedByLifetimeMarkers(const Value *V); + /// Return true if the only users of this pointer are lifetime markers or + /// droppable instructions. + bool onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V); + /// Return true if speculation of the given load must be suppressed to avoid /// ordering or interfering with an active sanitizer. If not suppressed, /// dereferenceability and alignment must be proven separately. Note: This @@ -571,45 +584,65 @@ class Value; /// if, for all i, r is evaluated to poison or op raises UB if vi = poison. /// To filter out operands that raise UB on poison, you can use /// getGuaranteedNonPoisonOp. - bool propagatesPoison(const Instruction *I); + bool propagatesPoison(const Operator *I); - /// Return either nullptr or an operand of I such that I will trigger - /// undefined behavior if I is executed and that operand has a poison - /// value. - const Value *getGuaranteedNonPoisonOp(const Instruction *I); + /// Insert operands of I into Ops such that I will trigger undefined behavior + /// if I is executed and that operand has a poison value. + void getGuaranteedNonPoisonOps(const Instruction *I, + SmallPtrSetImpl &Ops); - /// Return true if the given instruction must trigger undefined behavior. + /// Return true if the given instruction must trigger undefined behavior /// when I is executed with any operands which appear in KnownPoison holding /// a poison value at the point of execution. bool mustTriggerUB(const Instruction *I, const SmallSet& KnownPoison); - /// Return true if this function can prove that if PoisonI is executed - /// and yields a poison value, then that will trigger undefined behavior. + /// Return true if this function can prove that if Inst is executed + /// and yields a poison value or undef bits, then that will trigger + /// undefined behavior. /// /// Note that this currently only considers the basic block that is - /// the parent of I. 
- bool programUndefinedIfPoison(const Instruction *PoisonI); - - /// Return true if I can create poison from non-poison operands. - /// For vectors, canCreatePoison returns true if there is potential poison in - /// any element of the result when vectors without poison are given as + /// the parent of Inst. + bool programUndefinedIfUndefOrPoison(const Instruction *Inst); + bool programUndefinedIfPoison(const Instruction *Inst); + + /// canCreateUndefOrPoison returns true if Op can create undef or poison from + /// non-undef & non-poison operands. + /// For vectors, canCreateUndefOrPoison returns true if there is potential + /// poison or undef in any element of the result when vectors without + /// undef/poison poison are given as operands. + /// For example, given `Op = shl <2 x i32> %x, <0, 32>`, this function returns + /// true. If Op raises immediate UB but never creates poison or undef + /// (e.g. sdiv I, 0), canCreatePoison returns false. + /// + /// canCreatePoison returns true if Op can create poison from non-poison /// operands. - /// For example, given `I = shl <2 x i32> %x, <0, 32>`, this function returns - /// true. If I raises immediate UB but never creates poison (e.g. sdiv I, 0), - /// canCreatePoison returns false. - bool canCreatePoison(const Instruction *I); - - /// Return true if this function can prove that V is never undef value - /// or poison value. - // + bool canCreateUndefOrPoison(const Operator *Op); + bool canCreatePoison(const Operator *Op); + + /// Return true if V is poison given that ValAssumedPoison is already poison. + /// For example, if ValAssumedPoison is `icmp X, 10` and V is `icmp X, 5`, + /// impliesPoison returns true. + bool impliesPoison(const Value *ValAssumedPoison, const Value *V); + + /// Return true if this function can prove that V does not have undef bits + /// and is never poison. If V is an aggregate value or vector, check whether + /// all elements (except padding) are not undef or poison. 
+ /// Note that this is different from canCreateUndefOrPoison because the + /// function assumes Op's operands are not poison/undef. + /// /// If CtxI and DT are specified this method performs flow-sensitive analysis /// and returns true if it is guaranteed to be never undef or poison /// immediately before the CtxI. bool isGuaranteedNotToBeUndefOrPoison(const Value *V, + AssumptionCache *AC = nullptr, const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr, unsigned Depth = 0); + bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC = nullptr, + const Instruction *CtxI = nullptr, + const DominatorTree *DT = nullptr, + unsigned Depth = 0); /// Specific patterns of select instructions we can match. enum SelectPatternFlavor { @@ -700,6 +733,14 @@ class Value; /// minimum/maximum flavor. CmpInst::Predicate getInverseMinMaxPred(SelectPatternFlavor SPF); + /// Check if the values in \p VL are select instructions that can be converted + /// to a min or max (vector) intrinsic. Returns the intrinsic ID, if such a + /// conversion is possible, together with a bool indicating whether all select + /// conditions are only used by the selects. Otherwise return + /// Intrinsic::not_intrinsic. + std::pair + canConvertToMinOrMaxIntrinsic(ArrayRef VL); + /// Return true if RHS is known to be implied true by LHS. Return false if /// RHS is known to be implied false by LHS. Otherwise, return None if no /// implication can be made. 
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def b/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def index 2f64b0fedc7a..cfc3d6115866 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def @@ -62,6 +62,87 @@ TLI_DEFINE_VECFUNC("acoshf", "vacoshf", 4) TLI_DEFINE_VECFUNC("atanhf", "vatanhf", 4) +#elif defined(TLI_DEFINE_LIBMVEC_X86_VECFUNCS) +// GLIBC Vector math Functions + +TLI_DEFINE_VECFUNC("sin", "_ZGVbN2v_sin", 2) +TLI_DEFINE_VECFUNC("sin", "_ZGVdN4v_sin", 4) + +TLI_DEFINE_VECFUNC("sinf", "_ZGVbN4v_sinf", 4) +TLI_DEFINE_VECFUNC("sinf", "_ZGVdN8v_sinf", 8) + +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVbN2v_sin", 2) +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVdN4v_sin", 4) + +TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVbN4v_sinf", 4) +TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVdN8v_sinf", 8) + +TLI_DEFINE_VECFUNC("cos", "_ZGVbN2v_cos", 2) +TLI_DEFINE_VECFUNC("cos", "_ZGVdN4v_cos", 4) + +TLI_DEFINE_VECFUNC("cosf", "_ZGVbN4v_cosf", 4) +TLI_DEFINE_VECFUNC("cosf", "_ZGVdN8v_cosf", 8) + +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVbN2v_cos", 2) +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVdN4v_cos", 4) + +TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVbN4v_cosf", 4) +TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVdN8v_cosf", 8) + +TLI_DEFINE_VECFUNC("pow", "_ZGVbN2vv_pow", 2) +TLI_DEFINE_VECFUNC("pow", "_ZGVdN4vv_pow", 4) + +TLI_DEFINE_VECFUNC("powf", "_ZGVbN4vv_powf", 4) +TLI_DEFINE_VECFUNC("powf", "_ZGVdN8vv_powf", 8) + +TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVbN2vv___pow_finite", 2) +TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVdN4vv___pow_finite", 4) + +TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVbN4vv___powf_finite", 4) +TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVdN8vv___powf_finite", 8) + +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVbN2vv_pow", 2) +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVdN4vv_pow", 4) + +TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVbN4vv_powf", 4) 
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVdN8vv_powf", 8) + +TLI_DEFINE_VECFUNC("exp", "_ZGVbN2v_exp", 2) +TLI_DEFINE_VECFUNC("exp", "_ZGVdN4v_exp", 4) + +TLI_DEFINE_VECFUNC("expf", "_ZGVbN4v_expf", 4) +TLI_DEFINE_VECFUNC("expf", "_ZGVdN8v_expf", 8) + +TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVbN2v___exp_finite", 2) +TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVdN4v___exp_finite", 4) + +TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVbN4v___expf_finite", 4) +TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVdN8v___expf_finite", 8) + +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVbN2v_exp", 2) +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVdN4v_exp", 4) + +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVbN4v_expf", 4) +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVdN8v_expf", 8) + +TLI_DEFINE_VECFUNC("log", "_ZGVbN2v_log", 2) +TLI_DEFINE_VECFUNC("log", "_ZGVdN4v_log", 4) + +TLI_DEFINE_VECFUNC("logf", "_ZGVbN4v_logf", 4) +TLI_DEFINE_VECFUNC("logf", "_ZGVdN8v_logf", 8) + +TLI_DEFINE_VECFUNC("__log_finite", "_ZGVbN2v___log_finite", 2) +TLI_DEFINE_VECFUNC("__log_finite", "_ZGVdN4v___log_finite", 4) + +TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVbN4v___logf_finite", 4) +TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVdN8v___logf_finite", 8) + +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVbN2v_log", 2) +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", 4) + +TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", 4) +TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", 8) + #elif defined(TLI_DEFINE_MASSV_VECFUNCS) // IBM MASS library's vector Functions @@ -245,6 +326,70 @@ TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf4", 4) TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf8", 8) TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf16", 16) +TLI_DEFINE_VECFUNC("log2", "__svml_log22", 2) +TLI_DEFINE_VECFUNC("log2", "__svml_log24", 4) +TLI_DEFINE_VECFUNC("log2", "__svml_log28", 8) + +TLI_DEFINE_VECFUNC("log2f", "__svml_log2f4", 4) +TLI_DEFINE_VECFUNC("log2f", "__svml_log2f8", 8) +TLI_DEFINE_VECFUNC("log2f", "__svml_log2f16", 16) + 
+TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log22", 2) +TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log24", 4) +TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log28", 8) + +TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f4", 4) +TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f8", 8) +TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f16", 16) + +TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log22", 2) +TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log24", 4) +TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log28", 8) + +TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f4", 4) +TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f8", 8) +TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f16", 16) + +TLI_DEFINE_VECFUNC("log10", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("log10", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("log10", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("log10f", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log102", 2) +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log104", 4) +TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log108", 8) + +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f4", 4) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f8", 8) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f16", 16) + +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt2", 2) +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt4", 4) +TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt8", 8) + +TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf4", 4) +TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf8", 8) +TLI_DEFINE_VECFUNC("sqrtf", 
"__svml_sqrtf16", 16) + +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt2", 2) +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt4", 4) +TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt8", 8) + +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf4", 4) +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf8", 8) +TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf16", 16) + TLI_DEFINE_VECFUNC("exp2", "__svml_exp22", 2) TLI_DEFINE_VECFUNC("exp2", "__svml_exp24", 4) TLI_DEFINE_VECFUNC("exp2", "__svml_exp28", 8) @@ -275,6 +420,7 @@ TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", 16) #undef TLI_DEFINE_VECFUNC #undef TLI_DEFINE_ACCELERATE_VECFUNCS +#undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS #undef TLI_DEFINE_MASSV_VECFUNCS #undef TLI_DEFINE_SVML_VECFUNCS #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/VectorUtils.h b/contrib/llvm-project/llvm/include/llvm/Analysis/VectorUtils.h index b1d7850442fb..26cb0e456ed4 100644 --- a/contrib/llvm-project/llvm/include/llvm/Analysis/VectorUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/Analysis/VectorUtils.h @@ -14,12 +14,12 @@ #define LLVM_ANALYSIS_VECTORUTILS_H #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Support/CheckedArithmetic.h" namespace llvm { +class TargetLibraryInfo; /// Describes the type of Parameters enum class VFParamKind { @@ -99,7 +99,8 @@ struct VFShape { // Retrieve the VFShape that can be used to map a (scalar) function to itself, // with VF = 1. 
static VFShape getScalarShape(const CallInst &CI) { - return VFShape::get(CI, /*EC*/ {1, false}, /*HasGlobalPredicate*/ false); + return VFShape::get(CI, ElementCount::getFixed(1), + /*HasGlobalPredicate*/ false); } // Retrieve the basic vectorization shape of the function, where all @@ -114,7 +115,7 @@ struct VFShape { Parameters.push_back( VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate})); - return {EC.Min, EC.Scalable, Parameters}; + return {EC.getKnownMinValue(), EC.isScalable(), Parameters}; } /// Sanity check on the Parameters in the VFShape. bool hasValidParameterList() const; @@ -299,13 +300,17 @@ namespace Intrinsic { typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -inline Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { - if (Scalar->isVoidTy() || VF == 1) +/// A helper function for converting Scalar types to vector types. If +/// the incoming type is void, we return void. If the EC represents a +/// scalar, we return the scalar type. +inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { + if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar()) return Scalar; - return VectorType::get(Scalar, {VF, isScalable}); + return VectorType::get(Scalar, EC); +} + +inline Type *ToVectorTy(Type *Scalar, unsigned VF) { + return ToVectorTy(Scalar, ElementCount::getFixed(VF)); } /// Identify if the intrinsic is trivially vectorizable. @@ -353,7 +358,7 @@ int getSplatIndex(ArrayRef Mask); /// Get splat value if the input is a splat vector or return nullptr. /// The value may be extracted from a splat constants vector or from /// a sequence of instructions that broadcast a single value into a vector. 
-const Value *getSplatValue(const Value *V); +Value *getSplatValue(const Value *V); /// Return true if each element of the vector value \p V is poisoned or equal to /// every other non-poisoned element. If an index element is specified, either @@ -539,20 +544,20 @@ createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs); /// elements, it will be padded with undefs. Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef Vecs); -/// Given a mask vector of the form , Return true if all of the -/// elements of this predicate mask are false or undef. That is, return true -/// if all lanes can be assumed inactive. +/// Given a mask vector of i1, Return true if all of the elements of this +/// predicate mask are known to be false or undef. That is, return true if all +/// lanes can be assumed inactive. bool maskIsAllZeroOrUndef(Value *Mask); -/// Given a mask vector of the form , Return true if all of the -/// elements of this predicate mask are true or undef. That is, return true -/// if all lanes can be assumed active. +/// Given a mask vector of i1, Return true if all of the elements of this +/// predicate mask are known to be true or undef. That is, return true if all +/// lanes can be assumed active. bool maskIsAllOneOrUndef(Value *Mask); /// Given a mask vector of the form , return an APInt (of bitwidth Y) /// for each lane which may be active. APInt possiblyDemandedEltsInMask(Value *Mask); - + /// The group of interleaved loads/stores sharing the same stride and /// close to each other. /// @@ -615,6 +620,11 @@ public: return false; int32_t Key = *MaybeKey; + // Skip if the key is used for either the tombstone or empty special values. + if (DenseMapInfo::getTombstoneKey() == Key || + DenseMapInfo::getEmptyKey() == Key) + return false; + // Skip if there is already a member with the same index. if (Members.find(Key) != Members.end()) return false; @@ -650,11 +660,7 @@ public: /// \returns nullptr if contains no such member. 
InstTy *getMember(uint32_t Index) const { int32_t Key = SmallestKey + Index; - auto Member = Members.find(Key); - if (Member == Members.end()) - return nullptr; - - return Member->second; + return Members.lookup(Key); } /// Get the index for the given member. Unlike the key in the member @@ -772,9 +778,7 @@ public: /// \returns nullptr if doesn't have such group. InterleaveGroup * getInterleaveGroup(const Instruction *Instr) const { - if (InterleaveGroupMap.count(Instr)) - return InterleaveGroupMap.find(Instr)->second; - return nullptr; + return InterleaveGroupMap.lookup(Instr); } iterator_range *>> diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/COFF.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/COFF.h index 1919d7f0dece..716d649f7c51 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/COFF.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/COFF.h @@ -311,6 +311,7 @@ enum SectionCharacteristics : uint32_t { IMAGE_SCN_ALIGN_2048BYTES = 0x00C00000, IMAGE_SCN_ALIGN_4096BYTES = 0x00D00000, IMAGE_SCN_ALIGN_8192BYTES = 0x00E00000, + IMAGE_SCN_ALIGN_MASK = 0x00F00000, IMAGE_SCN_LNK_NRELOC_OVFL = 0x01000000, IMAGE_SCN_MEM_DISCARDABLE = 0x02000000, IMAGE_SCN_MEM_NOT_CACHED = 0x04000000, diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.def b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.def index f0337ef4fb54..f69877bb50df 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.def +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.def @@ -17,7 +17,7 @@ defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \ defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE || \ defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO || \ - defined HANDLE_MACRO_FLAG || \ + defined HANDLE_DW_MACRO_GNU || defined HANDLE_MACRO_FLAG || \ defined HANDLE_DW_RLE || defined HANDLE_DW_LLE || \ (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) || \ defined 
HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \ @@ -88,6 +88,10 @@ #define HANDLE_DW_MACRO(ID, NAME) #endif +#ifndef HANDLE_DW_MACRO_GNU +#define HANDLE_DW_MACRO_GNU(ID, NAME) +#endif + #ifndef HANDLE_MACRO_FLAG #define HANDLE_MACRO_FLAG(ID, NAME) #endif @@ -837,6 +841,18 @@ HANDLE_DW_MACRO(0x0a, import_sup) HANDLE_DW_MACRO(0x0b, define_strx) HANDLE_DW_MACRO(0x0c, undef_strx) +// GNU .debug_macro extension. +HANDLE_DW_MACRO_GNU(0x01, define) +HANDLE_DW_MACRO_GNU(0x02, undef) +HANDLE_DW_MACRO_GNU(0x03, start_file) +HANDLE_DW_MACRO_GNU(0x04, end_file) +HANDLE_DW_MACRO_GNU(0x05, define_indirect) +HANDLE_DW_MACRO_GNU(0x06, undef_indirect) +HANDLE_DW_MACRO_GNU(0x07, transparent_include) +HANDLE_DW_MACRO_GNU(0x08, define_indirect_alt) +HANDLE_DW_MACRO_GNU(0x09, undef_indirect_alt) +HANDLE_DW_MACRO_GNU(0x0a, transparent_include_alt) + // DWARF v5 Macro header flags. HANDLE_MACRO_FLAG(0x01, OFFSET_SIZE) HANDLE_MACRO_FLAG(0x02, DEBUG_LINE_OFFSET) @@ -986,6 +1002,7 @@ HANDLE_DW_SECT(8, RNGLISTS) #undef HANDLE_DW_LNE #undef HANDLE_DW_LNCT #undef HANDLE_DW_MACRO +#undef HANDLE_DW_MACRO_GNU #undef HANDLE_MACRO_FLAG #undef HANDLE_DW_RLE #undef HANDLE_DW_LLE diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.h index 4e8b708f39bb..cafc5be686bc 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -27,6 +27,8 @@ #include "llvm/Support/FormatVariadicDetails.h" #include "llvm/ADT/Triple.h" +#include + namespace llvm { class StringRef; @@ -118,10 +120,11 @@ enum LocationAtom { #include "llvm/BinaryFormat/Dwarf.def" DW_OP_lo_user = 0xe0, DW_OP_hi_user = 0xff, - DW_OP_LLVM_fragment = 0x1000, ///< Only used in LLVM metadata. - DW_OP_LLVM_convert = 0x1001, ///< Only used in LLVM metadata. - DW_OP_LLVM_tag_offset = 0x1002, ///< Only used in LLVM metadata. 
- DW_OP_LLVM_entry_value = 0x1003, ///< Only used in LLVM metadata. + DW_OP_LLVM_fragment = 0x1000, ///< Only used in LLVM metadata. + DW_OP_LLVM_convert = 0x1001, ///< Only used in LLVM metadata. + DW_OP_LLVM_tag_offset = 0x1002, ///< Only used in LLVM metadata. + DW_OP_LLVM_entry_value = 0x1003, ///< Only used in LLVM metadata. + DW_OP_LLVM_implicit_pointer = 0x1004, ///< Only used in LLVM metadata. }; enum TypeKind : uint8_t { @@ -183,6 +186,7 @@ enum SourceLanguage { }; inline bool isCPlusPlus(SourceLanguage S) { + bool result = false; // Deliberately enumerate all the language options so we get a warning when // new language options are added (-Wswitch) that'll hopefully help keep this // switch up-to-date when new C++ versions are added. @@ -191,7 +195,8 @@ inline bool isCPlusPlus(SourceLanguage S) { case DW_LANG_C_plus_plus_03: case DW_LANG_C_plus_plus_11: case DW_LANG_C_plus_plus_14: - return true; + result = true; + break; case DW_LANG_C89: case DW_LANG_C: case DW_LANG_Ada83: @@ -230,9 +235,68 @@ inline bool isCPlusPlus(SourceLanguage S) { case DW_LANG_BORLAND_Delphi: case DW_LANG_lo_user: case DW_LANG_hi_user: - return false; + result = false; + break; } - llvm_unreachable("Invalid source language"); + + return result; +} + +inline bool isFortran(SourceLanguage S) { + bool result = false; + // Deliberately enumerate all the language options so we get a warning when + // new language options are added (-Wswitch) that'll hopefully help keep this + // switch up-to-date when new Fortran versions are added. 
+ switch (S) { + case DW_LANG_Fortran77: + case DW_LANG_Fortran90: + case DW_LANG_Fortran95: + case DW_LANG_Fortran03: + case DW_LANG_Fortran08: + result = true; + break; + case DW_LANG_C89: + case DW_LANG_C: + case DW_LANG_Ada83: + case DW_LANG_C_plus_plus: + case DW_LANG_Cobol74: + case DW_LANG_Cobol85: + case DW_LANG_Pascal83: + case DW_LANG_Modula2: + case DW_LANG_Java: + case DW_LANG_C99: + case DW_LANG_Ada95: + case DW_LANG_PLI: + case DW_LANG_ObjC: + case DW_LANG_ObjC_plus_plus: + case DW_LANG_UPC: + case DW_LANG_D: + case DW_LANG_Python: + case DW_LANG_OpenCL: + case DW_LANG_Go: + case DW_LANG_Modula3: + case DW_LANG_Haskell: + case DW_LANG_C_plus_plus_03: + case DW_LANG_C_plus_plus_11: + case DW_LANG_OCaml: + case DW_LANG_Rust: + case DW_LANG_C11: + case DW_LANG_Swift: + case DW_LANG_Julia: + case DW_LANG_Dylan: + case DW_LANG_C_plus_plus_14: + case DW_LANG_RenderScript: + case DW_LANG_BLISS: + case DW_LANG_Mips_Assembler: + case DW_LANG_GOOGLE_RenderScript: + case DW_LANG_BORLAND_Delphi: + case DW_LANG_lo_user: + case DW_LANG_hi_user: + result = false; + break; + } + + return result; } enum CaseSensitivity { @@ -309,6 +373,14 @@ enum MacroEntryType { DW_MACRO_hi_user = 0xff }; +/// GNU .debug_macro macro information entry type encodings. +enum GnuMacroEntryType { +#define HANDLE_DW_MACRO_GNU(ID, NAME) DW_MACRO_GNU_##NAME = ID, +#include "llvm/BinaryFormat/Dwarf.def" + DW_MACRO_GNU_lo_user = 0xe0, + DW_MACRO_GNU_hi_user = 0xff +}; + /// DWARF v5 range list entry encoding values. 
enum RnglistEntries { #define HANDLE_DW_RLE(ID, NAME) DW_RLE_##NAME = ID, @@ -472,6 +544,7 @@ StringRef LNStandardString(unsigned Standard); StringRef LNExtendedString(unsigned Encoding); StringRef MacinfoString(unsigned Encoding); StringRef MacroString(unsigned Encoding); +StringRef GnuMacroString(unsigned Encoding); StringRef RangeListEncodingString(unsigned Encoding); StringRef LocListEncodingString(unsigned Encoding); StringRef CallFrameString(unsigned Encoding, Triple::ArchType Arch); @@ -483,6 +556,7 @@ StringRef GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage); StringRef IndexString(unsigned Idx); StringRef FormatString(DwarfFormat Format); StringRef FormatString(bool IsDWARF64); +StringRef RLEString(unsigned RLE); /// @} /// \defgroup DwarfConstantsParsing Dwarf constants parsing functions @@ -674,6 +748,11 @@ template <> struct EnumTraits : public std::true_type { static constexpr char Type[3] = "OP"; static constexpr StringRef (*StringFn)(unsigned) = &OperationEncodingString; }; + +inline uint64_t computeTombstoneAddress(uint8_t AddressByteSize) { + return std::numeric_limits::max() >> (8 - AddressByteSize) * 8; +} + } // End of namespace dwarf /// Dwarf constants format_provider diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/DynamicTags.def b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/DynamicTags.def index aec408bd2d72..c08f8a53bdb5 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/DynamicTags.def +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/DynamicTags.def @@ -120,6 +120,7 @@ DYNAMIC_TAG(VERNEEDNUM, 0X6FFFFFFF) // The number of entries in DT_VERNEED. 
// AArch64 specific dynamic table entries AARCH64_DYNAMIC_TAG(AARCH64_BTI_PLT, 0x70000001) AARCH64_DYNAMIC_TAG(AARCH64_PAC_PLT, 0x70000003) +AARCH64_DYNAMIC_TAG(AARCH64_VARIANT_PCS, 0x70000005) // Hexagon specific dynamic table entries HEXAGON_DYNAMIC_TAG(HEXAGON_SYMSZ, 0x70000000) diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELF.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELF.h index 21a5c26883cd..1552303b610c 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELF.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELF.h @@ -107,13 +107,17 @@ struct Elf64_Ehdr { unsigned char getDataEncoding() const { return e_ident[EI_DATA]; } }; -// File types +// File types. +// See current registered ELF types at: +// http://www.sco.com/developers/gabi/latest/ch4.eheader.html enum { ET_NONE = 0, // No file type ET_REL = 1, // Relocatable file ET_EXEC = 2, // Executable file ET_DYN = 3, // Shared object file ET_CORE = 4, // Core file + ET_LOOS = 0xfe00, // Beginning of operating system-specific codes + ET_HIOS = 0xfeff, // Operating system-specific ET_LOPROC = 0xff00, // Beginning of processor-specific codes ET_HIPROC = 0xffff // Processor-specific }; @@ -312,6 +316,7 @@ enum { EM_LANAI = 244, // Lanai 32-bit processor EM_BPF = 247, // Linux kernel bpf virtual machine EM_VE = 251, // NEC SX-Aurora VE + EM_CSKY = 252, // C-SKY 32-bit processor }; // Object file classes. @@ -359,6 +364,14 @@ enum { ELFOSABI_LAST_ARCH = 255 // Last Architecture-specific OS ABI }; +// AMDGPU OS ABI Version identification. +enum { + // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification + // was never defined for V1. + ELFABIVERSION_AMDGPU_HSA_V2 = 0, + ELFABIVERSION_AMDGPU_HSA_V3 = 1, +}; + #define ELF_RELOC(name, value) name = value, // X86_64 relocations. @@ -686,41 +699,39 @@ enum : unsigned { EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS, // AMDGCN-based processors. - - // AMDGCN GFX6. 
- EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, - EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, - // AMDGCN GFX7. - EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, - EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, - EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, - EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, - EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, - // AMDGCN GFX8. - EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, - EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, - EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, - EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, - // AMDGCN GFX9. - EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, - EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, - EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, - EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, - EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, - EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, - // AMDGCN GFX10. - EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, - EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, - EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, - EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, - - // Reserved for AMDGCN-based processors. - EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027, - EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + 
EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1030, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. @@ -777,6 +788,12 @@ enum { #include "ELFRelocs/VE.def" }; + +// ELF Relocation types for CSKY +enum { +#include "ELFRelocs/CSKY.def" +}; + #undef ELF_RELOC // Section header. @@ -854,10 +871,11 @@ enum : unsigned { SHT_LLVM_ADDRSIG = 0x6fff4c03, // List of address-significant symbols // for safe ICF. SHT_LLVM_DEPENDENT_LIBRARIES = - 0x6fff4c04, // LLVM Dependent Library Specifiers. - SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. - SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. - SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. + 0x6fff4c04, // LLVM Dependent Library Specifiers. + SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. + SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. + SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. + SHT_LLVM_BB_ADDR_MAP = 0x6fff4c08, // LLVM Basic Block Address Map. // Android's experimental support for SHT_RELR sections. // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512 SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets. 
diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def new file mode 100644 index 000000000000..c5f2dbae8033 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def @@ -0,0 +1,74 @@ + +#ifndef ELF_RELOC +#error "ELF_RELOC must be defined" +#endif + +ELF_RELOC(R_CKCORE_NONE, 0) +ELF_RELOC(R_CKCORE_ADDR32, 1) +ELF_RELOC(R_CKCORE_PCREL_IMM8_4, 2) +ELF_RELOC(R_CKCORE_PCREL_IMM11_2, 3) +ELF_RELOC(R_CKCORE_PCREL_IMM4_2, 4) +ELF_RELOC(R_CKCORE_PCREL32, 5) +ELF_RELOC(R_CKCORE_PCREL_JSR_IMM11_2, 6) +ELF_RELOC(R_CKCORE_GNU_VTINHERIT, 7) +ELF_RELOC(R_CKCORE_GNU_VTENTRY, 8) +ELF_RELOC(R_CKCORE_RELATIVE, 9) +ELF_RELOC(R_CKCORE_COPY, 10) +ELF_RELOC(R_CKCORE_GLOB_DAT, 11) +ELF_RELOC(R_CKCORE_JUMP_SLOT, 12) +ELF_RELOC(R_CKCORE_GOTOFF, 13) +ELF_RELOC(R_CKCORE_GOTPC, 14) +ELF_RELOC(R_CKCORE_GOT32, 15) +ELF_RELOC(R_CKCORE_PLT32, 16) +ELF_RELOC(R_CKCORE_ADDRGOT, 17) +ELF_RELOC(R_CKCORE_ADDRPLT, 18) +ELF_RELOC(R_CKCORE_PCREL_IMM26_2, 19) +ELF_RELOC(R_CKCORE_PCREL_IMM16_2, 20) +ELF_RELOC(R_CKCORE_PCREL_IMM16_4, 21) +ELF_RELOC(R_CKCORE_PCREL_IMM10_2, 22) +ELF_RELOC(R_CKCORE_PCREL_IMM10_4, 23) +ELF_RELOC(R_CKCORE_ADDR_HI16, 24) +ELF_RELOC(R_CKCORE_ADDR_LO16, 25) +ELF_RELOC(R_CKCORE_GOTPC_HI16, 26) +ELF_RELOC(R_CKCORE_GOTPC_LO16, 27) +ELF_RELOC(R_CKCORE_GOTOFF_HI16, 28) +ELF_RELOC(R_CKCORE_GOTOFF_LO16, 29) +ELF_RELOC(R_CKCORE_GOT12, 30) +ELF_RELOC(R_CKCORE_GOT_HI16, 31) +ELF_RELOC(R_CKCORE_GOT_LO16, 32) +ELF_RELOC(R_CKCORE_PLT12, 33) +ELF_RELOC(R_CKCORE_PLT_HI16, 34) +ELF_RELOC(R_CKCORE_PLT_LO16, 35) +ELF_RELOC(R_CKCORE_ADDRGOT_HI16, 36) +ELF_RELOC(R_CKCORE_ADDRGOT_LO16, 37) +ELF_RELOC(R_CKCORE_ADDRPLT_HI16, 38) +ELF_RELOC(R_CKCORE_ADDRPLT_LO16, 39) +ELF_RELOC(R_CKCORE_PCREL_JSR_IMM26_2, 40) +ELF_RELOC(R_CKCORE_TOFFSET_LO16, 41) +ELF_RELOC(R_CKCORE_DOFFSET_LO16, 42) +ELF_RELOC(R_CKCORE_PCREL_IMM18_2, 43) +ELF_RELOC(R_CKCORE_DOFFSET_IMM18, 44) 
+ELF_RELOC(R_CKCORE_DOFFSET_IMM18_2, 45) +ELF_RELOC(R_CKCORE_DOFFSET_IMM18_4, 46) +ELF_RELOC(R_CKCORE_GOTOFF_IMM18, 47) +ELF_RELOC(R_CKCORE_GOT_IMM18_4, 48) +ELF_RELOC(R_CKCORE_PLT_IMM18_4, 49) +ELF_RELOC(R_CKCORE_PCREL_IMM7_4, 50) +ELF_RELOC(R_CKCORE_TLS_LE32, 51) +ELF_RELOC(R_CKCORE_TLS_IE32, 52) +ELF_RELOC(R_CKCORE_TLS_GD32, 53) +ELF_RELOC(R_CKCORE_TLS_LDM32, 54) +ELF_RELOC(R_CKCORE_TLS_LDO32, 55) +ELF_RELOC(R_CKCORE_TLS_DTPMOD32, 56) +ELF_RELOC(R_CKCORE_TLS_DTPOFF32, 57) +ELF_RELOC(R_CKCORE_TLS_TPOFF32, 58) +ELF_RELOC(R_CKCORE_PCREL_FLRW_IMM8_4, 59) +ELF_RELOC(R_CKCORE_NOJSRI, 60) +ELF_RELOC(R_CKCORE_CALLGRAPH, 61) +ELF_RELOC(R_CKCORE_IRELATIVE, 62) +ELF_RELOC(R_CKCORE_PCREL_BLOOP_IMM4_4, 63) +ELF_RELOC(R_CKCORE_PCREL_BLOOP_IMM12_4, 64) +ELF_RELOC(R_CKCORE_PCREL_VLRW_IMM12_1, 65) +ELF_RELOC(R_CKCORE_PCREL_VLRW_IMM12_2, 66) +ELF_RELOC(R_CKCORE_PCREL_VLRW_IMM12_4, 67) +ELF_RELOC(R_CKCORE_PCREL_VLRW_IMM12_8, 68) diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def index e28c9caaefaf..0422aa0606d7 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC64.def @@ -97,8 +97,14 @@ #undef R_PPC64_DTPREL16_HIGH #undef R_PPC64_DTPREL16_HIGHA #undef R_PPC64_REL24_NOTOC +#undef R_PPC64_PCREL_OPT #undef R_PPC64_PCREL34 #undef R_PPC64_GOT_PCREL34 +#undef R_PPC64_TPREL34 +#undef R_PPC64_DTPREL34 +#undef R_PPC64_GOT_TLSGD_PCREL34 +#undef R_PPC64_GOT_TLSLD_PCREL34 +#undef R_PPC64_GOT_TPREL_PCREL34 #undef R_PPC64_IRELATIVE #undef R_PPC64_REL16 #undef R_PPC64_REL16_LO @@ -194,8 +200,14 @@ ELF_RELOC(R_PPC64_TPREL16_HIGHA, 113) ELF_RELOC(R_PPC64_DTPREL16_HIGH, 114) ELF_RELOC(R_PPC64_DTPREL16_HIGHA, 115) ELF_RELOC(R_PPC64_REL24_NOTOC, 116) +ELF_RELOC(R_PPC64_PCREL_OPT, 123) ELF_RELOC(R_PPC64_PCREL34, 132) ELF_RELOC(R_PPC64_GOT_PCREL34, 133) 
+ELF_RELOC(R_PPC64_TPREL34, 146) +ELF_RELOC(R_PPC64_DTPREL34, 147) +ELF_RELOC(R_PPC64_GOT_TLSGD_PCREL34, 148) +ELF_RELOC(R_PPC64_GOT_TLSLD_PCREL34, 149) +ELF_RELOC(R_PPC64_GOT_TPREL_PCREL34, 150) ELF_RELOC(R_PPC64_IRELATIVE, 248) ELF_RELOC(R_PPC64_REL16, 249) ELF_RELOC(R_PPC64_REL16_LO, 250) diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/MachO.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/MachO.h index e43fea0a2465..f5d5ec328b5e 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/MachO.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/MachO.h @@ -83,6 +83,7 @@ enum { MH_NO_HEAP_EXECUTION = 0x01000000u, MH_APP_EXTENSION_SAFE = 0x02000000u, MH_NLIST_OUTOFSYNC_WITH_DYLDINFO = 0x04000000u, + MH_SIM_SUPPORT = 0x08000000u, MH_DYLIB_IN_CACHE = 0x80000000u, }; @@ -495,7 +496,8 @@ enum PlatformType { PLATFORM_MACCATALYST = 6, PLATFORM_IOSSIMULATOR = 7, PLATFORM_TVOSSIMULATOR = 8, - PLATFORM_WATCHOSSIMULATOR = 9 + PLATFORM_WATCHOSSIMULATOR = 9, + PLATFORM_DRIVERKIT = 10, }; // Values for tools enum in build_tool_version. 
@@ -1492,6 +1494,7 @@ enum CPUSubTypeARM { enum CPUSubTypeARM64 { CPU_SUBTYPE_ARM64_ALL = 0, + CPU_SUBTYPE_ARM64_V8 = 1, CPU_SUBTYPE_ARM64E = 2, }; diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Wasm.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Wasm.h index 1aca692e30a7..063c6a3f9449 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/Wasm.h @@ -41,7 +41,7 @@ struct WasmDylinkInfo { uint32_t MemoryAlignment; // P2 alignment of memory uint32_t TableSize; // Table size in elements uint32_t TableAlignment; // P2 alignment of table - std::vector Needed; // Shared library depenedencies + std::vector Needed; // Shared library dependencies }; struct WasmProducerInfo { @@ -67,11 +67,17 @@ struct WasmLimits { uint64_t Maximum; }; -struct WasmTable { +struct WasmTableType { uint8_t ElemType; WasmLimits Limits; }; +struct WasmTable { + uint32_t Index; + WasmTableType Type; + StringRef SymbolName; // from the "linking" section +}; + struct WasmInitExpr { uint8_t Opcode; union { @@ -114,7 +120,7 @@ struct WasmImport { union { uint32_t SigIndex; WasmGlobalType Global; - WasmTable Table; + WasmTableType Table; WasmLimits Memory; WasmEventType Event; }; @@ -140,8 +146,11 @@ struct WasmFunction { struct WasmDataSegment { uint32_t InitFlags; - uint32_t MemoryIndex; // present if InitFlags & WASM_SEGMENT_HAS_MEMINDEX - WasmInitExpr Offset; // present if InitFlags & WASM_SEGMENT_IS_PASSIVE == 0 + // Present if InitFlags & WASM_DATA_SEGMENT_HAS_MEMINDEX. + uint32_t MemoryIndex; + // Present if InitFlags & WASM_DATA_SEGMENT_IS_PASSIVE == 0. + WasmInitExpr Offset; + ArrayRef Content; StringRef Name; // from the "segment info" section uint32_t Alignment; @@ -186,15 +195,22 @@ struct WasmSymbolInfo { // For symbols to be exported from the final module Optional ExportName; union { - // For function or global symbols, the index in function or global index - // space. 
+ // For function, table, or global symbols, the index in function, table, or + // global index space. uint32_t ElementIndex; // For a data symbols, the address of the data relative to segment. WasmDataReference DataRef; }; }; -struct WasmFunctionName { +enum class NameType { + FUNCTION, + GLOBAL, + DATA_SEGMENT, +}; + +struct WasmDebugName { + NameType Type; uint32_t Index; StringRef Name; }; @@ -231,7 +247,6 @@ enum : unsigned { WASM_TYPE_F64 = 0x7C, WASM_TYPE_V128 = 0x7B, WASM_TYPE_FUNCREF = 0x70, - WASM_TYPE_EXNREF = 0x68, WASM_TYPE_EXTERNREF = 0x6F, WASM_TYPE_FUNC = 0x60, WASM_TYPE_NORESULT = 0x40, // for blocks with no result values @@ -251,6 +266,7 @@ enum : unsigned { WASM_OPCODE_END = 0x0b, WASM_OPCODE_CALL = 0x10, WASM_OPCODE_LOCAL_GET = 0x20, + WASM_OPCODE_LOCAL_SET = 0x21, WASM_OPCODE_GLOBAL_GET = 0x23, WASM_OPCODE_GLOBAL_SET = 0x24, WASM_OPCODE_I32_STORE = 0x36, @@ -287,8 +303,8 @@ enum : unsigned { }; enum : unsigned { - WASM_SEGMENT_IS_PASSIVE = 0x01, - WASM_SEGMENT_HAS_MEMINDEX = 0x02, + WASM_DATA_SEGMENT_IS_PASSIVE = 0x01, + WASM_DATA_SEGMENT_HAS_MEMINDEX = 0x02, }; // Feature policy prefixes used in the custom "target_features" section @@ -300,8 +316,10 @@ enum : uint8_t { // Kind codes used in the custom "name" section enum : unsigned { - WASM_NAMES_FUNCTION = 0x1, - WASM_NAMES_LOCAL = 0x2, + WASM_NAMES_FUNCTION = 1, + WASM_NAMES_LOCAL = 2, + WASM_NAMES_GLOBAL = 7, + WASM_NAMES_DATA_SEGMENT = 9, }; // Kind codes used in the custom "linking" section @@ -316,6 +334,8 @@ enum : unsigned { enum : unsigned { WASM_COMDAT_DATA = 0x0, WASM_COMDAT_FUNCTION = 0x1, + // GLOBAL, EVENT, and TABLE are in here but LLVM doesn't use them yet. + WASM_COMDAT_SECTION = 0x5, }; // Kind codes used in the custom "linking" section in the WASM_SYMBOL_TABLE @@ -325,6 +345,7 @@ enum WasmSymbolType : unsigned { WASM_SYMBOL_TYPE_GLOBAL = 0x2, WASM_SYMBOL_TYPE_SECTION = 0x3, WASM_SYMBOL_TYPE_EVENT = 0x4, + WASM_SYMBOL_TYPE_TABLE = 0x5, }; // Kinds of event attributes. 
@@ -360,7 +381,7 @@ enum class ValType { F32 = WASM_TYPE_F32, F64 = WASM_TYPE_F64, V128 = WASM_TYPE_V128, - EXNREF = WASM_TYPE_EXNREF, + FUNCREF = WASM_TYPE_FUNCREF, EXTERNREF = WASM_TYPE_EXTERNREF, }; diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmRelocs.def b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmRelocs.def index 05c5147e6314..dca63eca9455 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmRelocs.def +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmRelocs.def @@ -20,3 +20,8 @@ WASM_RELOC(R_WASM_MEMORY_ADDR_LEB64, 14) WASM_RELOC(R_WASM_MEMORY_ADDR_SLEB64, 15) WASM_RELOC(R_WASM_MEMORY_ADDR_I64, 16) WASM_RELOC(R_WASM_MEMORY_ADDR_REL_SLEB64, 17) +WASM_RELOC(R_WASM_TABLE_INDEX_SLEB64, 18) +WASM_RELOC(R_WASM_TABLE_INDEX_I64, 19) +WASM_RELOC(R_WASM_TABLE_NUMBER_LEB, 20) +WASM_RELOC(R_WASM_MEMORY_ADDR_TLS_SLEB, 21) +WASM_RELOC(R_WASM_FUNCTION_OFFSET_I64, 22) diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmTraits.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmTraits.h new file mode 100644 index 000000000000..e34182499187 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/WasmTraits.h @@ -0,0 +1,68 @@ +//===- WasmTraits.h - DenseMap traits for the Wasm structures ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides llvm::DenseMapInfo traits for the Wasm structures. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BINARYFORMAT_WASMTRAITS_H +#define LLVM_BINARYFORMAT_WASMTRAITS_H + +#include "llvm/ADT/Hashing.h" +#include "llvm/BinaryFormat/Wasm.h" + +namespace llvm { + +template struct DenseMapInfo; + +// Traits for using WasmSignature in a DenseMap. +template <> struct DenseMapInfo { + static wasm::WasmSignature getEmptyKey() { + wasm::WasmSignature Sig; + Sig.State = wasm::WasmSignature::Empty; + return Sig; + } + static wasm::WasmSignature getTombstoneKey() { + wasm::WasmSignature Sig; + Sig.State = wasm::WasmSignature::Tombstone; + return Sig; + } + static unsigned getHashValue(const wasm::WasmSignature &Sig) { + uintptr_t H = hash_value(Sig.State); + for (auto Ret : Sig.Returns) + H = hash_combine(H, Ret); + for (auto Param : Sig.Params) + H = hash_combine(H, Param); + return H; + } + static bool isEqual(const wasm::WasmSignature &LHS, + const wasm::WasmSignature &RHS) { + return LHS == RHS; + } +}; + +// Traits for using WasmGlobalType in a DenseMap +template <> struct DenseMapInfo { + static wasm::WasmGlobalType getEmptyKey() { + return wasm::WasmGlobalType{1, true}; + } + static wasm::WasmGlobalType getTombstoneKey() { + return wasm::WasmGlobalType{2, true}; + } + static unsigned getHashValue(const wasm::WasmGlobalType &GlobalType) { + return hash_combine(GlobalType.Type, GlobalType.Mutable); + } + static bool isEqual(const wasm::WasmGlobalType &LHS, + const wasm::WasmGlobalType &RHS) { + return LHS == RHS; + } +}; + +} // end namespace llvm + +#endif // LLVM_BINARYFORMAT_WASMTRAITS_H diff --git a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/XCOFF.h b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/XCOFF.h index 5a7ce80a2f62..48e1baf72689 100644 --- a/contrib/llvm-project/llvm/include/llvm/BinaryFormat/XCOFF.h +++ b/contrib/llvm-project/llvm/include/llvm/BinaryFormat/XCOFF.h @@ -18,6 +18,7 @@ namespace llvm { class StringRef; +template class SmallString; 
namespace XCOFF { @@ -28,6 +29,7 @@ constexpr size_t NameSize = 8; constexpr size_t SymbolTableEntrySize = 18; constexpr size_t RelocationSerializationSize32 = 10; constexpr uint16_t RelocOverflow = 65535; +constexpr uint8_t AllocRegNo = 31; enum ReservedSectionNum : int16_t { N_DEBUG = -2, N_ABS = -1, N_UNDEF = 0 }; @@ -294,6 +296,115 @@ enum CFileCpuId : uint8_t { StringRef getMappingClassString(XCOFF::StorageMappingClass SMC); StringRef getRelocationTypeString(XCOFF::RelocationType Type); +SmallString<32> parseParmsType(uint32_t Value, unsigned ParmsNum); + +struct TracebackTable { + enum LanguageID : uint8_t { + C, + Fortran, + Pascal, + Ada, + PL1, + Basic, + Lisp, + Cobol, + Modula2, + CPlusPlus, + Rpg, + PL8, + PLIX = PL8, + Assembly, + Java, + ObjectiveC + }; + // Byte 1 + static constexpr uint32_t VersionMask = 0xFF00'0000; + static constexpr uint8_t VersionShift = 24; + + // Byte 2 + static constexpr uint32_t LanguageIdMask = 0x00FF'0000; + static constexpr uint8_t LanguageIdShift = 16; + + // Byte 3 + static constexpr uint32_t IsGlobaLinkageMask = 0x0000'8000; + static constexpr uint32_t IsOutOfLineEpilogOrPrologueMask = 0x0000'4000; + static constexpr uint32_t HasTraceBackTableOffsetMask = 0x0000'2000; + static constexpr uint32_t IsInternalProcedureMask = 0x0000'1000; + static constexpr uint32_t HasControlledStorageMask = 0x0000'0800; + static constexpr uint32_t IsTOClessMask = 0x0000'0400; + static constexpr uint32_t IsFloatingPointPresentMask = 0x0000'0200; + static constexpr uint32_t IsFloatingPointOperationLogOrAbortEnabledMask = + 0x0000'0100; + + // Byte 4 + static constexpr uint32_t IsInterruptHandlerMask = 0x0000'0080; + static constexpr uint32_t IsFunctionNamePresentMask = 0x0000'0040; + static constexpr uint32_t IsAllocaUsedMask = 0x0000'0020; + static constexpr uint32_t OnConditionDirectiveMask = 0x0000'001C; + static constexpr uint32_t IsCRSavedMask = 0x0000'0002; + static constexpr uint32_t IsLRSavedMask = 0x0000'0001; + static constexpr 
uint8_t OnConditionDirectiveShift = 2; + + // Byte 5 + static constexpr uint32_t IsBackChainStoredMask = 0x8000'0000; + static constexpr uint32_t IsFixupMask = 0x4000'0000; + static constexpr uint32_t FPRSavedMask = 0x3F00'0000; + static constexpr uint32_t FPRSavedShift = 24; + + // Byte 6 + static constexpr uint32_t HasVectorInfoMask = 0x0080'0000; + static constexpr uint32_t HasExtensionTableMask = 0x0040'0000; + static constexpr uint32_t GPRSavedMask = 0x003F'0000; + static constexpr uint32_t GPRSavedShift = 16; + + // Byte 7 + static constexpr uint32_t NumberOfFixedParmsMask = 0x0000'FF00; + static constexpr uint8_t NumberOfFixedParmsShift = 8; + + // Byte 8 + static constexpr uint32_t NumberOfFloatingPointParmsMask = 0x0000'00FE; + static constexpr uint32_t HasParmsOnStackMask = 0x0000'0001; + static constexpr uint8_t NumberOfFloatingPointParmsShift = 1; + + // Masks to select leftmost bits for decoding parameter type information. + // Bit to use when vector info is not presented. + static constexpr uint32_t ParmTypeIsFloatingBit = 0x8000'0000; + static constexpr uint32_t ParmTypeFloatingIsDoubleBit = 0x4000'0000; + // Bits to use when vector info is presented. 
+ static constexpr uint32_t ParmTypeIsFixedBits = 0x0000'0000; + static constexpr uint32_t ParmTypeIsVectorBits = 0x4000'0000; + static constexpr uint32_t ParmTypeIsFloatingBits = 0x8000'0000; + static constexpr uint32_t ParmTypeIsDoubleBits = 0xC000'0000; + static constexpr uint32_t ParmTypeMask = 0xC000'0000; + + // Vector extension + static constexpr uint16_t NumberOfVRSavedMask = 0xFC00; + static constexpr uint16_t IsVRSavedOnStackMask = 0x0200; + static constexpr uint16_t HasVarArgsMask = 0x0100; + static constexpr uint8_t NumberOfVRSavedShift = 10; + + static constexpr uint16_t NumberOfVectorParmsMask = 0x00FE; + static constexpr uint16_t HasVMXInstructionMask = 0x0001; + static constexpr uint8_t NumberOfVectorParmsShift = 1; + + static constexpr uint32_t ParmTypeIsVectorCharBit = 0x0000'0000; + static constexpr uint32_t ParmTypeIsVectorShortBit = 0x4000'0000; + static constexpr uint32_t ParmTypeIsVectorIntBit = 0x8000'0000; + static constexpr uint32_t ParmTypeIsVectorFloatBit = 0xC000'0000; +}; + +// Extended Traceback table flags. +enum ExtendedTBTableFlag : uint8_t { + TB_OS1 = 0x80, ///< Reserved for OS use. + TB_RESERVED = 0x40, ///< Reserved for compiler. + TB_SSP_CANARY = 0x20, ///< stack smasher canary present on stack. + TB_OS2 = 0x10, ///< Reserved for OS use. + TB_EH_INFO = 0x08, ///< Exception handling info present. + TB_LONGTBTABLE2 = 0x01 ///< Additional tbtable extension exists. 
+}; + +StringRef getNameForTracebackTableLanguageId(TracebackTable::LanguageID LangId); +SmallString<32> getExtendedTBTableFlagString(uint8_t Flag); } // end namespace XCOFF } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeCommon.h b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeCommon.h new file mode 100644 index 000000000000..6a3e74550bc4 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeCommon.h @@ -0,0 +1,30 @@ +//===- BitcodeCommon.h - Common code for encode/decode --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines common code to be used by BitcodeWriter and +// BitcodeReader. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BITCODE_BITCODECOMMON_H +#define LLVM_BITCODE_BITCODECOMMON_H + +#include "llvm/ADT/Bitfields.h" + +namespace llvm { + +struct AllocaPackedValues { + using Align = Bitfield::Element; + using UsedWithInAlloca = Bitfield::Element; + using ExplicitType = Bitfield::Element; + using SwiftError = Bitfield::Element; +}; + +} // namespace llvm + +#endif // LLVM_BITCODE_BITCODECOMMON_H diff --git a/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeConvenience.h b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeConvenience.h new file mode 100644 index 000000000000..0060d014ba82 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeConvenience.h @@ -0,0 +1,486 @@ +//===- llvm/Bitcode/BitcodeConvenience.h - Convenience Wrappers -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file Convenience wrappers for the LLVM bitcode format and bitstream APIs. +/// +/// This allows you to use a sort of DSL to declare and use bitcode +/// abbreviations and records. Example: +/// +/// \code +/// using Metadata = BCRecordLayout< +/// METADATA_ID, // ID +/// BCFixed<16>, // Module format major version +/// BCFixed<16>, // Module format minor version +/// BCBlob // misc. version information +/// >; +/// Metadata metadata(Out); +/// metadata.emit(ScratchRecord, VERSION_MAJOR, VERSION_MINOR, Data); +/// \endcode +/// +/// For details on the bitcode format, see +/// http://llvm.org/docs/BitCodeFormat.html +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BITCODE_BITCODECONVENIENCE_H +#define LLVM_BITCODE_BITCODECONVENIENCE_H + +#include "llvm/Bitstream/BitCodes.h" +#include "llvm/Bitstream/BitstreamWriter.h" +#include + +namespace llvm { +namespace detail { +/// Convenience base for all kinds of bitcode abbreviation fields. +/// +/// This just defines common properties queried by the metaprogramming. +template class BCField { +public: + static const bool IsCompound = Compound; + + /// Asserts that the given data is a valid value for this field. + template static void assertValid(const T &data) {} + + /// Converts a raw numeric representation of this value to its preferred + /// type. + template static T convert(T rawValue) { return rawValue; } +}; +} // namespace detail + +/// Represents a literal operand in a bitcode record. +/// +/// The value of a literal operand is the same for all instances of the record, +/// so it is only emitted in the abbreviation definition. +/// +/// Note that because this uses a compile-time template, you cannot have a +/// literal operand that is fixed at run-time without dropping down to the +/// raw LLVM APIs. 
+template class BCLiteral : public detail::BCField<> { +public: + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(Value)); + } + + template static void assertValid(const T &data) { + assert(data == Value && "data value does not match declared literal value"); + } +}; + +/// Represents a fixed-width value in a bitcode record. +/// +/// Note that the LLVM bitcode format only supports unsigned values. +template class BCFixed : public detail::BCField<> { +public: + static_assert(Width <= 64, "fixed-width field is too large"); + + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Fixed, Width)); + } + + static void assertValid(const bool &data) { + assert(llvm::isUInt(data) && + "data value does not fit in the given bit width"); + } + + template static void assertValid(const T &data) { + assert(data >= 0 && "cannot encode signed integers"); + assert(llvm::isUInt(data) && + "data value does not fit in the given bit width"); + } +}; + +/// Represents a variable-width value in a bitcode record. +/// +/// The \p Width parameter should include the continuation bit. +/// +/// Note that the LLVM bitcode format only supports unsigned values. +template class BCVBR : public detail::BCField<> { + static_assert(Width >= 2, "width does not have room for continuation bit"); + +public: + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, Width)); + } + + template static void assertValid(const T &data) { + assert(data >= 0 && "cannot encode signed integers"); + } +}; + +/// Represents a character encoded in LLVM's Char6 encoding. +/// +/// This format is suitable for encoding decimal numbers (without signs or +/// exponents) and C identifiers (without dollar signs), but not much else. 
+/// +/// \sa http://llvm.org/docs/BitCodeFormat.html#char6-encoded-value +class BCChar6 : public detail::BCField<> { +public: + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Char6)); + } + + template static void assertValid(const T &data) { + assert(llvm::BitCodeAbbrevOp::isChar6(data) && "invalid Char6 data"); + } + + template char convert(T rawValue) { + return static_cast(rawValue); + } +}; + +/// Represents an untyped blob of bytes. +/// +/// If present, this must be the last field in a record. +class BCBlob : public detail::BCField { +public: + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + } +}; + +/// Represents an array of some other type. +/// +/// If present, this must be the last field in a record. +template class BCArray : public detail::BCField { + static_assert(!ElementTy::IsCompound, "arrays can only contain scalar types"); + +public: + static void emitOp(llvm::BitCodeAbbrev &abbrev) { + abbrev.Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Array)); + ElementTy::emitOp(abbrev); + } +}; + +namespace detail { +/// Attaches the last field to an abbreviation. +/// +/// This is the base case for \c emitOps. +/// +/// \sa BCRecordLayout::emitAbbrev +template static void emitOps(llvm::BitCodeAbbrev &abbrev) { + FieldTy::emitOp(abbrev); +} + +/// Attaches fields to an abbreviation. +/// +/// This is the recursive case for \c emitOps. +/// +/// \sa BCRecordLayout::emitAbbrev +template +static void emitOps(llvm::BitCodeAbbrev &abbrev) { + static_assert(!FieldTy::IsCompound, + "arrays and blobs may not appear in the middle of a record"); + FieldTy::emitOp(abbrev); + emitOps(abbrev); +} + +/// Helper class for dealing with a scalar element in the middle of a record. 
+/// +/// \sa BCRecordLayout +template class BCRecordCoding { +public: + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, ElementDataTy element, DataTy &&...data) { + static_assert(!ElementTy::IsCompound, + "arrays and blobs may not appear in the middle of a record"); + ElementTy::assertValid(element); + buffer.push_back(element); + BCRecordCoding::emit(Stream, buffer, code, + std::forward(data)...); + } + + template + static void read(ArrayRef buffer, ElementDataTy &element, + DataTy &&...data) { + assert(!buffer.empty() && "too few elements in buffer"); + element = ElementTy::convert(buffer.front()); + BCRecordCoding::read(buffer.slice(1), + std::forward(data)...); + } + + template + static void read(ArrayRef buffer, NoneType, DataTy &&...data) { + assert(!buffer.empty() && "too few elements in buffer"); + BCRecordCoding::read(buffer.slice(1), + std::forward(data)...); + } +}; + +/// Helper class for dealing with a scalar element at the end of a record. +/// +/// This has a separate implementation because up until now we've only been +/// \em building the record (into a data buffer), and now we need to hand it +/// off to the BitstreamWriter to be emitted. 
+/// +/// \sa BCRecordLayout +template class BCRecordCoding { +public: + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, const DataTy &data) { + static_assert(!ElementTy::IsCompound, + "arrays and blobs need special handling"); + ElementTy::assertValid(data); + buffer.push_back(data); + Stream.EmitRecordWithAbbrev(code, buffer); + } + + template + static void read(ArrayRef buffer, DataTy &data) { + assert(buffer.size() == 1 && "record data does not match layout"); + data = ElementTy::convert(buffer.front()); + } + + template static void read(ArrayRef buffer, NoneType) { + assert(buffer.size() == 1 && "record data does not match layout"); + (void)buffer; + } + + template static void read(ArrayRef buffer) = delete; +}; + +/// Helper class for dealing with an array at the end of a record. +/// +/// \sa BCRecordLayout::emitRecord +template class BCRecordCoding> { +public: + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, StringRef data) { + // TODO: validate array data. + Stream.EmitRecordWithArray(code, buffer, data); + } + + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, const ArrayTy &array) { +#ifndef NDEBUG + for (auto &element : array) + ElementTy::assertValid(element); +#endif + buffer.reserve(buffer.size() + std::distance(array.begin(), array.end())); + std::copy(array.begin(), array.end(), std::back_inserter(buffer)); + Stream.EmitRecordWithAbbrev(code, buffer); + } + + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, ElementDataTy element, DataTy... 
data) { + std::array array{{element, data...}}; + emit(Stream, buffer, code, array); + } + + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &Buffer, + unsigned code, NoneType) { + Stream.EmitRecordWithAbbrev(code, Buffer); + } + + template + static void read(ArrayRef Buffer, ArrayRef &rawData) { + rawData = Buffer; + } + + template + static void read(ArrayRef buffer, ArrayTy &array) { + array.append(llvm::map_iterator(buffer.begin(), T::convert), + llvm::map_iterator(buffer.end(), T::convert)); + } + + template static void read(ArrayRef buffer, NoneType) { + (void)buffer; + } + + template static void read(ArrayRef buffer) = delete; +}; + +/// Helper class for dealing with a blob at the end of a record. +/// +/// \sa BCRecordLayout +template <> class BCRecordCoding { +public: + template + static void emit(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned code, StringRef data) { + Stream.EmitRecordWithBlob(code, buffer, data); + } + + template static void read(ArrayRef buffer) { (void)buffer; } + + /// Blob data is not stored in the buffer if you are using the correct + /// accessor; this method should not be used. + template + static void read(ArrayRef buffer, DataTy &data) = delete; +}; + +/// A type trait whose \c type field is the last of its template parameters. +template struct last_type { + using type = typename last_type::type; +}; + +template struct last_type { using type = Head; }; + +/// A type trait whose \c value field is \c true if the last type is BCBlob. +template +using has_blob = std::is_same::type>; + +/// A type trait whose \c value field is \c true if the given type is a +/// BCArray (of any element kind). +template struct is_array { +private: + template static bool check(BCArray *); + static int check(...); + +public: + typedef bool value_type; + static constexpr bool value = !std::is_same::value; +}; + +/// A type trait whose \c value field is \c true if the last type is a +/// BCArray (of any element kind). 
+template +using has_array = is_array::type>; +} // namespace detail + +/// Represents a single bitcode record type. +/// +/// This class template is meant to be instantiated and then given a name, +/// so that from then on that name can be used. +template class BCGenericRecordLayout { + llvm::BitstreamWriter &Stream; + +public: + /// The abbreviation code used for this record in the current block. + /// + /// Note that this is not the same as the semantic record code, which is the + /// first field of the record. + const unsigned AbbrevCode; + + /// Create a layout and register it with the given bitstream writer. + explicit BCGenericRecordLayout(llvm::BitstreamWriter &Stream) + : Stream(Stream), AbbrevCode(emitAbbrev(Stream)) {} + + /// Emit a record to the bitstream writer, using the given buffer for scratch + /// space. + /// + /// Note that even fixed arguments must be specified here. + template + void emit(BufferTy &buffer, unsigned id, Data &&...data) const { + emitRecord(Stream, buffer, AbbrevCode, id, std::forward(data)...); + } + + /// Registers this record's layout with the bitstream reader. + /// + /// eturns The abbreviation code for the newly-registered record type. + static unsigned emitAbbrev(llvm::BitstreamWriter &Stream) { + auto Abbrev = std::make_shared(); + detail::emitOps(*Abbrev); + return Stream.EmitAbbrev(std::move(Abbrev)); + } + + /// Emit a record identified by \p abbrCode to bitstream reader \p Stream, + /// using \p buffer for scratch space. + /// + /// Note that even fixed arguments must be specified here. Blobs are passed + /// as StringRefs, while arrays can be passed inline, as aggregates, or as + /// pre-encoded StringRef data. Skipped values and empty arrays should use + /// the special Nothing value. 
+ template + static void emitRecord(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned abbrCode, unsigned recordID, Data &&...data) { + static_assert(sizeof...(data) <= sizeof...(Fields) || + detail::has_array::value, + "Too many record elements"); + static_assert(sizeof...(data) >= sizeof...(Fields), + "Too few record elements"); + buffer.clear(); + detail::BCRecordCoding::emit( + Stream, buffer, abbrCode, recordID, std::forward(data)...); + } + + /// Extract record data from \p buffer into the given data fields. + /// + /// Note that even fixed arguments must be specified here. Pass \c Nothing + /// if you don't care about a particular parameter. Blob data is not included + /// in the buffer and should be handled separately by the caller. + template + static void readRecord(ArrayRef buffer, Data &&...data) { + static_assert(sizeof...(data) <= sizeof...(Fields), + "Too many record elements"); + static_assert(sizeof...(Fields) <= + sizeof...(data) + detail::has_blob::value, + "Too few record elements"); + return detail::BCRecordCoding::read(buffer, + std::forward(data)...); + } + + /// Extract record data from \p buffer into the given data fields. + /// + /// Note that even fixed arguments must be specified here. Pass \c Nothing + /// if you don't care about a particular parameter. Blob data is not included + /// in the buffer and should be handled separately by the caller. + template + static void readRecord(BufferTy &buffer, Data &&...data) { + return readRecord(llvm::makeArrayRef(buffer), std::forward(data)...); + } +}; + +/// A record with a fixed record code. +template +class BCRecordLayout + : public BCGenericRecordLayout, Fields...> { + using Base = BCGenericRecordLayout, Fields...>; + +public: + enum : unsigned { + /// The record code associated with this layout. + Code = RecordCode + }; + + /// Create a layout and register it with the given bitstream writer. 
+ explicit BCRecordLayout(llvm::BitstreamWriter &Stream) : Base(Stream) {} + + /// Emit a record to the bitstream writer, using the given buffer for scratch + /// space. + /// + /// Note that even fixed arguments must be specified here. + template + void emit(BufferTy &buffer, Data &&...data) const { + Base::emit(buffer, RecordCode, std::forward(data)...); + } + + /// Emit a record identified by \p abbrCode to bitstream reader \p Stream, + /// using \p buffer for scratch space. + /// + /// Note that even fixed arguments must be specified here. Currently, arrays + /// and blobs can only be passed as StringRefs. + template + static void emitRecord(llvm::BitstreamWriter &Stream, BufferTy &buffer, + unsigned abbrCode, Data &&...data) { + Base::emitRecord(Stream, buffer, abbrCode, RecordCode, + std::forward(data)...); + } +}; + +/// RAII object to pair entering and exiting a sub-block. +class BCBlockRAII { + llvm::BitstreamWriter &Stream; + +public: + BCBlockRAII(llvm::BitstreamWriter &Stream, unsigned block, unsigned abbrev) + : Stream(Stream) { + Stream.EnterSubblock(block, abbrev); + } + + ~BCBlockRAII() { Stream.ExitBlock(); } +}; +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeWriter.h b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeWriter.h index 4beb89d30e00..7ad2d37a2a35 100644 --- a/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/contrib/llvm-project/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -47,7 +47,7 @@ class raw_ostream; public: /// Create a BitcodeWriter that writes to Buffer. - BitcodeWriter(SmallVectorImpl &Buffer); + BitcodeWriter(SmallVectorImpl &Buffer, raw_fd_stream *FS = nullptr); ~BitcodeWriter(); @@ -152,10 +152,18 @@ class raw_ostream; const std::map *ModuleToSummariesForIndex = nullptr); - /// Save a copy of the llvm IR as data in the __LLVM,__bitcode section. 
+ /// If EmbedBitcode is set, save a copy of the llvm IR as data in the + /// __LLVM,__bitcode section (.llvmbc on non-MacOS). + /// If available, pass the serialized module via the Buf parameter. If not, + /// pass an empty (default-initialized) MemoryBufferRef, and the serialization + /// will be handled by this API. The same behavior happens if the provided Buf + /// is not bitcode (i.e. if it's invalid data or even textual LLVM assembly). + /// If EmbedCmdline is set, the command line is also exported in + /// the corresponding section (__LLVM,_cmdline / .llvmcmd) - even if CmdArgs + /// were empty. void EmbedBitcodeInModule(Module &M, MemoryBufferRef Buf, bool EmbedBitcode, - bool EmbedMarker, - const std::vector *CmdArgs); + bool EmbedCmdline, + const std::vector &CmdArgs); } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/contrib/llvm-project/llvm/include/llvm/Bitcode/LLVMBitCodes.h index de4fe6630324..5b4854d6c95e 100644 --- a/contrib/llvm-project/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/contrib/llvm-project/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -168,7 +168,8 @@ enum TypeCodes { TYPE_CODE_TOKEN = 22, // TOKEN - TYPE_CODE_BFLOAT = 23 // BRAIN FLOATING POINT + TYPE_CODE_BFLOAT = 23, // BRAIN FLOATING POINT + TYPE_CODE_X86_AMX = 24 // X86 AMX }; enum OperandBundleTagCode { @@ -338,7 +339,11 @@ enum MetadataCodes { METADATA_INDEX_OFFSET = 38, // [offset] METADATA_INDEX = 39, // [bitpos] METADATA_LABEL = 40, // [distinct, scope, name, file, line] - METADATA_COMMON_BLOCK = 44, // [distinct, scope, name, variable,...] + METADATA_STRING_TYPE = 41, // [distinct, name, size, align,...] + // Codes 42 and 43 are reserved for support for Fortran array specific debug + // info. + METADATA_COMMON_BLOCK = 44, // [distinct, scope, name, variable,...] 
+ METADATA_GENERIC_SUBRANGE = 45 // [distinct, count, lo, up, stride] }; // The constants block (CONSTANTS_BLOCK_ID) describes emission for each @@ -371,6 +376,7 @@ enum ConstantsCodes { // asmdialect,asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, // [opty, flags, n x operands] CST_CODE_CE_UNOP = 25, // CE_UNOP: [opcode, opval] + CST_CODE_POISON = 26, // POISON }; /// CastOpcodes - These are values used in the bitcode files to encode which @@ -536,8 +542,9 @@ enum FunctionCodes { FUNC_CODE_DEBUG_LOC = 35, // DEBUG_LOC: [Line,Col,ScopeVal, IAVal] FUNC_CODE_INST_FENCE = 36, // FENCE: [ordering, synchscope] - FUNC_CODE_INST_CMPXCHG_OLD = 37, // CMPXCHG: [ptrty,ptr,cmp,new, align, vol, - // ordering, synchscope] + FUNC_CODE_INST_CMPXCHG_OLD = 37, // CMPXCHG: [ptrty, ptr, cmp, val, vol, + // ordering, synchscope, + // failure_ordering?, weak?] FUNC_CODE_INST_ATOMICRMW = 38, // ATOMICRMW: [ptrty,ptr,val, operation, // align, vol, // ordering, synchscope] @@ -551,8 +558,9 @@ enum FunctionCodes { FUNC_CODE_INST_GEP = 43, // GEP: [inbounds, n x operands] FUNC_CODE_INST_STORE = 44, // STORE: [ptrty,ptr,valty,val, align, vol] FUNC_CODE_INST_STOREATOMIC = 45, // STORE: [ptrty,ptr,val, align, vol - FUNC_CODE_INST_CMPXCHG = 46, // CMPXCHG: [ptrty,ptr,valty,cmp,new, align, - // vol,ordering,synchscope] + FUNC_CODE_INST_CMPXCHG = 46, // CMPXCHG: [ptrty, ptr, cmp, val, vol, + // success_ordering, synchscope, + // failure_ordering, weak] FUNC_CODE_INST_LANDINGPAD = 47, // LANDINGPAD: [ty,val,num,id0,val0...] 
FUNC_CODE_INST_CLEANUPRET = 48, // CLEANUPRET: [val] or [val,bb#] FUNC_CODE_INST_CATCHRET = 49, // CATCHRET: [val,bb#] @@ -644,6 +652,11 @@ enum AttributeKindCodes { ATTR_KIND_NO_MERGE = 66, ATTR_KIND_NULL_POINTER_IS_VALID = 67, ATTR_KIND_NOUNDEF = 68, + ATTR_KIND_BYREF = 69, + ATTR_KIND_MUSTPROGRESS = 70, + ATTR_KIND_NO_CALLBACK = 71, + ATTR_KIND_HOT = 72, + ATTR_KIND_NO_PROFILE = 73, }; enum ComdatSelectionKindCodes { diff --git a/contrib/llvm-project/llvm/include/llvm/Bitstream/BitCodes.h b/contrib/llvm-project/llvm/include/llvm/Bitstream/BitCodes.h index 41a3de3b20ef..9cd4e535a470 100644 --- a/contrib/llvm-project/llvm/include/llvm/Bitstream/BitCodes.h +++ b/contrib/llvm-project/llvm/include/llvm/Bitstream/BitCodes.h @@ -18,6 +18,7 @@ #define LLVM_BITSTREAM_BITCODES_H #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include @@ -137,13 +138,7 @@ public: } /// isChar6 - Return true if this character is legal in the Char6 encoding. - static bool isChar6(char C) { - if (C >= 'a' && C <= 'z') return true; - if (C >= 'A' && C <= 'Z') return true; - if (C >= '0' && C <= '9') return true; - if (C == '.' || C == '_') return true; - return false; - } + static bool isChar6(char C) { return isAlnum(C) || C == '.' 
|| C == '_'; } static unsigned EncodeChar6(char C) { if (C >= 'a' && C <= 'z') return C-'a'; if (C >= 'A' && C <= 'Z') return C-'A'+26; diff --git a/contrib/llvm-project/llvm/include/llvm/Bitstream/BitstreamWriter.h b/contrib/llvm-project/llvm/include/llvm/Bitstream/BitstreamWriter.h index c0ead19dc71d..3954df4897ae 100644 --- a/contrib/llvm-project/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/contrib/llvm-project/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -20,17 +20,28 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Bitstream/BitCodes.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/raw_ostream.h" +#include #include namespace llvm { class BitstreamWriter { + /// Out - The buffer that keeps unflushed bytes. SmallVectorImpl &Out; + /// FS - The file stream that Out flushes to. If FS is nullptr, it does not + /// support read or seek, Out cannot be flushed until all data are written. + raw_fd_stream *FS; + + /// FlushThreshold - If FS is valid, this is the threshold (unit B) to flush + /// FS. + const uint64_t FlushThreshold; + /// CurBit - Always between 0 and 31 inclusive, specifies the next bit to use. unsigned CurBit; - /// CurValue - The current value. Only bits < CurBit are valid. + /// CurValue - The current value. Only bits < CurBit are valid. uint32_t CurValue; /// CurCodeSize - This is the declared size of code values used for the @@ -64,15 +75,19 @@ class BitstreamWriter { void WriteByte(unsigned char Value) { Out.push_back(Value); + FlushToFile(); } void WriteWord(unsigned Value) { Value = support::endian::byte_swap(Value); Out.append(reinterpret_cast(&Value), reinterpret_cast(&Value + 1)); + FlushToFile(); } - size_t GetBufferOffset() const { return Out.size(); } + uint64_t GetNumOfFlushedBytes() const { return FS ? 
FS->tell() : 0; } + + size_t GetBufferOffset() const { return Out.size() + GetNumOfFlushedBytes(); } size_t GetWordIndex() const { size_t Offset = GetBufferOffset(); @@ -80,9 +95,29 @@ class BitstreamWriter { return Offset / 4; } + /// If the related file stream supports reading, seeking and writing, flush + /// the buffer if its size is above a threshold. + void FlushToFile() { + if (!FS) + return; + if (Out.size() < FlushThreshold) + return; + FS->write((char *)&Out.front(), Out.size()); + Out.clear(); + } + public: - explicit BitstreamWriter(SmallVectorImpl &O) - : Out(O), CurBit(0), CurValue(0), CurCodeSize(2) {} + /// Create a BitstreamWriter that writes to Buffer \p O. + /// + /// \p FS is the file stream that \p O flushes to incrementally. If \p FS is + /// null, \p O does not flush incrementially, but writes to disk at the end. + /// + /// \p FlushThreshold is the threshold (unit M) to flush \p O if \p FS is + /// valid. + BitstreamWriter(SmallVectorImpl &O, raw_fd_stream *FS = nullptr, + uint32_t FlushThreshold = 512) + : Out(O), FS(FS), FlushThreshold(FlushThreshold << 20), CurBit(0), + CurValue(0), CurCodeSize(2) {} ~BitstreamWriter() { assert(CurBit == 0 && "Unflushed data remaining"); @@ -103,12 +138,60 @@ public: /// with the specified value. 
void BackpatchWord(uint64_t BitNo, unsigned NewWord) { using namespace llvm::support; - unsigned ByteNo = BitNo / 8; - assert((!endian::readAtBitAlignment( - &Out[ByteNo], BitNo & 7)) && - "Expected to be patching over 0-value placeholders"); - endian::writeAtBitAlignment( - &Out[ByteNo], NewWord, BitNo & 7); + uint64_t ByteNo = BitNo / 8; + uint64_t StartBit = BitNo & 7; + uint64_t NumOfFlushedBytes = GetNumOfFlushedBytes(); + + if (ByteNo >= NumOfFlushedBytes) { + assert((!endian::readAtBitAlignment( + &Out[ByteNo - NumOfFlushedBytes], StartBit)) && + "Expected to be patching over 0-value placeholders"); + endian::writeAtBitAlignment( + &Out[ByteNo - NumOfFlushedBytes], NewWord, StartBit); + return; + } + + // If the byte offset to backpatch is flushed, use seek to backfill data. + // First, save the file position to restore later. + uint64_t CurPos = FS->tell(); + + // Copy data to update into Bytes from the file FS and the buffer Out. + char Bytes[9]; // Use one more byte to silence a warning from Visual C++. + size_t BytesNum = StartBit ? 8 : 4; + size_t BytesFromDisk = std::min(static_cast(BytesNum), NumOfFlushedBytes - ByteNo); + size_t BytesFromBuffer = BytesNum - BytesFromDisk; + + // When unaligned, copy existing data into Bytes from the file FS and the + // buffer Out so that it can be updated before writing. For debug builds + // read bytes unconditionally in order to check that the existing value is 0 + // as expected. +#ifdef NDEBUG + if (StartBit) +#endif + { + FS->seek(ByteNo); + ssize_t BytesRead = FS->read(Bytes, BytesFromDisk); + (void)BytesRead; // silence warning + assert(BytesRead >= 0 && static_cast(BytesRead) == BytesFromDisk); + for (size_t i = 0; i < BytesFromBuffer; ++i) + Bytes[BytesFromDisk + i] = Out[i]; + assert((!endian::readAtBitAlignment( + Bytes, StartBit)) && + "Expected to be patching over 0-value placeholders"); + } + + // Update Bytes in terms of bit offset and value. 
+ endian::writeAtBitAlignment(Bytes, NewWord, + StartBit); + + // Copy updated data back to the file FS and the buffer Out. + FS->seek(ByteNo); + FS->write(Bytes, BytesFromDisk); + for (size_t i = 0; i < BytesFromBuffer; ++i) + Out[i] = Bytes[BytesFromDisk + i]; + + // Restore the file position. + FS->seek(CurPos); } void BackpatchWord64(uint64_t BitNo, uint64_t Val) { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/Analysis.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/Analysis.h index fe610b5bdc8d..bdfb416d9bd9 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/Analysis.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/Analysis.h @@ -92,11 +92,6 @@ void computeValueLLTs(const DataLayout &DL, Type &Ty, /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V. GlobalValue *ExtractTypeInfo(Value *V); -/// hasInlineAsmMemConstraint - Return true if the inline asm instruction being -/// processed uses a memory 'm' constraint. -bool hasInlineAsmMemConstraint(InlineAsm::ConstraintInfoVector &CInfos, - const TargetLowering &TLI); - /// getFCmpCondCode - Return the ISD condition code corresponding to /// the given LLVM IR floating-point condition code. This includes /// consideration of global floating-point math flags. 
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/AntiDepBreaker.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/AntiDepBreaker.h index d75c13e2dd75..0553d7d452a4 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/AntiDepBreaker.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/AntiDepBreaker.h @@ -17,7 +17,6 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/Compiler.h" #include diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinter.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinter.h index 0eb950861af6..76486b0b48ce 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -66,6 +66,7 @@ class MCSymbol; class MCTargetOptions; class MDNode; class Module; +class PseudoProbeHandler; class raw_ostream; class StackMaps; class StringRef; @@ -139,9 +140,30 @@ public: using GOTEquivUsePair = std::pair; MapVector GlobalGOTEquivs; + /// struct HandlerInfo and Handlers permit users or target extended + /// AsmPrinter to add their own handlers. + struct HandlerInfo { + std::unique_ptr Handler; + const char *TimerName; + const char *TimerDescription; + const char *TimerGroupName; + const char *TimerGroupDescription; + + HandlerInfo(std::unique_ptr Handler, + const char *TimerName, const char *TimerDescription, + const char *TimerGroupName, const char *TimerGroupDescription) + : Handler(std::move(Handler)), TimerName(TimerName), + TimerDescription(TimerDescription), TimerGroupName(TimerGroupName), + TimerGroupDescription(TimerGroupDescription) {} + }; + private: MCSymbol *CurrentFnEnd = nullptr; - MCSymbol *CurExceptionSym = nullptr; + + /// Map a basic block section ID to the exception symbol associated with that + /// section. 
Map entries are assigned and looked up via + /// AsmPrinter::getMBBExceptionSym. + DenseMap MBBSectionExceptionSyms; // The symbol used to represent the start of the current BB section of the // function. This is used to calculate the size of the BB section. @@ -158,26 +180,10 @@ private: protected: MCSymbol *CurrentFnBegin = nullptr; - /// Protected struct HandlerInfo and Handlers permit target extended - /// AsmPrinter adds their own handlers. - struct HandlerInfo { - std::unique_ptr Handler; - const char *TimerName; - const char *TimerDescription; - const char *TimerGroupName; - const char *TimerGroupDescription; - - HandlerInfo(std::unique_ptr Handler, - const char *TimerName, const char *TimerDescription, - const char *TimerGroupName, const char *TimerGroupDescription) - : Handler(std::move(Handler)), TimerName(TimerName), - TimerDescription(TimerDescription), TimerGroupName(TimerGroupName), - TimerGroupDescription(TimerGroupDescription) {} - }; - /// A vector of all debug/EH info emitters we should use. This vector /// maintains ownership of the emitters. - SmallVector Handlers; + std::vector Handlers; + size_t NumUserHandlers = 0; public: struct SrcMgrDiagInfo { @@ -201,6 +207,10 @@ private: /// If the target supports dwarf debug info, this pointer is non-null. DwarfDebug *DD = nullptr; + /// A handler that supports pseudo probe emission with embedded inline + /// context. + PseudoProbeHandler *PP = nullptr; + /// If the current module uses dwarf CFI annotations strictly for debugging. bool isCFIMoveForDebugging = false; @@ -216,6 +226,14 @@ public: uint16_t getDwarfVersion() const; void setDwarfVersion(uint16_t Version); + bool isDwarf64() const; + + /// Returns 4 for DWARF32 and 8 for DWARF64. + unsigned int getDwarfOffsetByteSize() const; + + /// Returns 4 for DWARF32 and 12 for DWARF64. + unsigned int getUnitLengthFieldByteSize() const; + bool isPositionIndependent() const; /// Return true if assembly output should contain comments. 
@@ -230,7 +248,10 @@ public: MCSymbol *getFunctionBegin() const { return CurrentFnBegin; } MCSymbol *getFunctionEnd() const { return CurrentFnEnd; } - MCSymbol *getCurExceptionSym(); + + // Return the exception symbol associated with the MBB section containing a + // given basic block. + MCSymbol *getMBBExceptionSym(const MachineBasicBlock &MBB); /// Return information about object file lowering. const TargetLoweringObjectFile &getObjFileLowering() const; @@ -342,6 +363,10 @@ public: void emitStackSizeSection(const MachineFunction &MF); + void emitBBAddrMapSection(const MachineFunction &MF); + + void emitPseudoProbe(const MachineInstr &MI); + void emitRemarksSection(remarks::RemarkStreamer &RS); enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug }; @@ -369,6 +394,32 @@ public: /// so, emit it and return true, otherwise do nothing and return false. bool emitSpecialLLVMGlobal(const GlobalVariable *GV); + /// `llvm.global_ctors` and `llvm.global_dtors` are arrays of Structor + /// structs. + /// + /// Priority - init priority + /// Func - global initialization or global clean-up function + /// ComdatKey - associated data + struct Structor { + int Priority = 0; + Constant *Func = nullptr; + GlobalValue *ComdatKey = nullptr; + + Structor() = default; + }; + + /// This method gathers an array of Structors and then sorts them out by + /// Priority. + /// @param List The initializer of `llvm.global_ctors` or `llvm.global_dtors` + /// array. + /// @param[out] Structors Sorted Structor structs by Priority. + void preprocessXXStructorList(const DataLayout &DL, const Constant *List, + SmallVector &Structors); + + /// This method emits `llvm.global_ctors` or `llvm.global_dtors` list. + virtual void emitXXStructorList(const DataLayout &DL, const Constant *List, + bool IsCtor); + /// Emit an alignment directive to the specified power of two boundary. 
If a /// global value is specified, and if that global has an explicit alignment /// requested, it will override the alignment request if required for @@ -403,6 +454,11 @@ public: // Overridable Hooks //===------------------------------------------------------------------===// + void addAsmPrinterHandler(HandlerInfo Handler) { + Handlers.insert(Handlers.begin(), std::move(Handler)); + NumUserHandlers++; + } + // Targets can, or in the case of EmitInstruction, must implement these to // customize output. @@ -534,9 +590,6 @@ public: emitLabelPlusOffset(Label, 0, Size, IsSectionRelative); } - /// Emit something like ".long Label + Offset". - void emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const; - //===------------------------------------------------------------------===// // Dwarf Emission Helper Routines //===------------------------------------------------------------------===// @@ -557,7 +610,7 @@ public: unsigned GetSizeOfEncodedValue(unsigned Encoding) const; /// Emit reference to a ttype global with a specified encoding. - void emitTTypeReference(const GlobalValue *GV, unsigned Encoding) const; + virtual void emitTTypeReference(const GlobalValue *GV, unsigned Encoding); /// Emit a reference to a symbol for use in dwarf. Different object formats /// represent this in different ways. Some use a relocation others encode @@ -565,18 +618,39 @@ public: void emitDwarfSymbolReference(const MCSymbol *Label, bool ForceOffset = false) const; - /// Emit the 4-byte offset of a string from the start of its section. + /// Emit the 4- or 8-byte offset of a string from the start of its section. /// /// When possible, emit a DwarfStringPool section offset without any /// relocations, and without using the symbol. Otherwise, defers to \a /// emitDwarfSymbolReference(). + /// + /// The length of the emitted value depends on the DWARF format. 
void emitDwarfStringOffset(DwarfStringPoolEntry S) const; - /// Emit the 4-byte offset of a string from the start of its section. + /// Emit the 4-or 8-byte offset of a string from the start of its section. void emitDwarfStringOffset(DwarfStringPoolEntryRef S) const { emitDwarfStringOffset(S.getEntry()); } + /// Emit something like ".long Label + Offset" or ".quad Label + Offset" + /// depending on the DWARF format. + void emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const; + + /// Emit 32- or 64-bit value depending on the DWARF format. + void emitDwarfLengthOrOffset(uint64_t Value) const; + + /// Emit a special value of 0xffffffff if producing 64-bit debugging info. + void maybeEmitDwarf64Mark() const; + + /// Emit a unit length field. The actual format, DWARF32 or DWARF64, is chosen + /// according to the settings. + void emitDwarfUnitLength(uint64_t Length, const Twine &Comment) const; + + /// Emit a unit length field. The actual format, DWARF32 or DWARF64, is chosen + /// according to the settings. + void emitDwarfUnitLength(const MCSymbol *Hi, const MCSymbol *Lo, + const Twine &Comment) const; + /// Emit reference to a call site with a specified encoding void emitCallSiteOffset(const MCSymbol *Hi, const MCSymbol *Lo, unsigned Encoding) const; @@ -713,12 +787,13 @@ private: void emitModuleIdents(Module &M); /// Emit bytes for llvm.commandline metadata. void emitModuleCommandLines(Module &M); - void emitXXStructorList(const DataLayout &DL, const Constant *List, - bool isCtor); GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy &S); /// Emit GlobalAlias or GlobalIFunc. void emitGlobalIndirectSymbol(Module &M, const GlobalIndirectSymbol &GIS); + + /// This method decides whether the specified basic block requires a label. 
+ bool shouldEmitLabelForBasicBlock(const MachineBasicBlock &MBB) const; }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinterHandler.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinterHandler.h index 899d067d03f0..dc81a3040097 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinterHandler.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/AsmPrinterHandler.h @@ -23,8 +23,10 @@ class MachineBasicBlock; class MachineFunction; class MachineInstr; class MCSymbol; +class Module; -typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm); +typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm, + const MachineBasicBlock *MBB); /// Collects and handles AsmPrinter objects required to build debug /// or EH information. @@ -36,6 +38,8 @@ public: /// this tracks that size. virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) = 0; + virtual void beginModule(Module *M) {} + /// Emit all sections that should come after the content. virtual void endModule() = 0; @@ -74,6 +78,7 @@ public: /// Process end of a basic block during basic block sections. virtual void endBasicBlock(const MachineBasicBlock &MBB) {} }; + } // End of namespace llvm #endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicBlockSectionUtils.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicBlockSectionUtils.h new file mode 100644 index 000000000000..d8da3be0cd4c --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicBlockSectionUtils.h @@ -0,0 +1,30 @@ +//===- BasicBlockSectionUtils.h - Utilities for basic block sections --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_BASICBLOCKSECTIONUTILS_H +#define LLVM_CODEGEN_BASICBLOCKSECTIONUTILS_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + +extern cl::opt BBSectionsColdTextPrefix; + +class MachineFunction; +class MachineBasicBlock; + +using MachineBasicBlockComparator = + function_ref; + +void sortBasicBlocksAndUpdateBranches(MachineFunction &MF, + MachineBasicBlockComparator MBBCmp); + +} // end namespace llvm + +#endif // LLVM_CODEGEN_BASICBLOCKSECTIONUTILS_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 407f09063dce..9514dd22be80 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -40,7 +40,6 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/MC/MCSchedule.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -115,12 +114,14 @@ private: /// Estimate a cost of subvector extraction as a sequence of extract and /// insert operations. - unsigned getExtractSubvectorOverhead(FixedVectorType *VTy, int Index, + unsigned getExtractSubvectorOverhead(VectorType *VTy, int Index, FixedVectorType *SubVTy) { assert(VTy && SubVTy && "Can only extract subvectors from vectors"); int NumSubElts = SubVTy->getNumElements(); - assert((Index + NumSubElts) <= (int)VTy->getNumElements() && + assert((!isa(VTy) || + (Index + NumSubElts) <= + (int)cast(VTy)->getNumElements()) && "SK_ExtractSubvector index out of range"); unsigned Cost = 0; @@ -138,12 +139,14 @@ private: /// Estimate a cost of subvector insertion as a sequence of extract and /// insert operations. 
- unsigned getInsertSubvectorOverhead(FixedVectorType *VTy, int Index, + unsigned getInsertSubvectorOverhead(VectorType *VTy, int Index, FixedVectorType *SubVTy) { assert(VTy && SubVTy && "Can only insert subvectors into vectors"); int NumSubElts = SubVTy->getNumElements(); - assert((Index + NumSubElts) <= (int)VTy->getNumElements() && + assert((!isa(VTy) || + (Index + NumSubElts) <= + (int)cast(VTy)->getNumElements()) && "SK_InsertSubvector index out of range"); unsigned Cost = 0; @@ -222,7 +225,11 @@ public: } bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { - return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS); + return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); + } + + unsigned getAssumedAddrSpace(const Value *V) const { + return getTLI()->getTargetMachine().getAssumedAddrSpace(V); } Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, @@ -265,6 +272,10 @@ public: return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } + bool isNumRegsMajorCostOfLSR() { + return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); + } + bool isProfitableLSRChainElement(Instruction *I) { return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); } @@ -294,6 +305,10 @@ public: return getTLI()->isTypeLegal(VT); } + unsigned getRegUsageForType(Type *Ty) { + return getTLI()->getTypeLegalizationCost(DL, Ty).first; + } + int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) { return BaseT::getGEPCost(PointeeType, Ptr, Operands); @@ -386,6 +401,7 @@ public: } unsigned getInliningThresholdMultiplier() { return 1; } + unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } int getInlinerVectorBonusPercent() { return 150; } @@ -477,6 +493,30 @@ public: return BaseT::emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) { + return BaseT::instCombineIntrinsic(IC, II); + } + + Optional simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, + 
IntrinsicInst &II, + APInt DemandedMask, + KnownBits &Known, + bool &KnownBitsComputed) { + return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) { + return BaseT::simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } + int getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; @@ -532,6 +572,8 @@ public: unsigned getRegisterBitWidth(bool Vector) const { return 32; } + Optional getMaxVScale() const { return None; } + /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. @@ -567,7 +609,7 @@ public: return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } - /// Estimate the overhead of scalarizing an instructions unique + /// Estimate the overhead of scalarizing an instruction's unique /// non-constant operands. The types of the arguments are ordinarily /// scalar, in which case the costs are multiplied with VF. unsigned getOperandsScalarizationOverhead(ArrayRef Args, @@ -575,8 +617,14 @@ public: unsigned Cost = 0; SmallPtrSet UniqueOperands; for (const Value *A : Args) { + // Disregard things like metadata arguments. + Type *Ty = A->getType(); + if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && + !Ty->isPtrOrPtrVectorTy()) + continue; + if (!isa(A) && UniqueOperands.insert(A).second) { - auto *VecTy = dyn_cast(A->getType()); + auto *VecTy = dyn_cast(Ty); if (VecTy) { // If A is a vector operand, VF should be 1 or correspond to A. 
assert((VF == 1 || @@ -584,7 +632,7 @@ public: "Vector argument does not match VF"); } else - VecTy = FixedVectorType::get(A->getType(), VF); + VecTy = FixedVectorType::get(Ty, VF); Cost += getScalarizationOverhead(VecTy, false, true); } @@ -658,7 +706,8 @@ public: if (auto *VTy = dyn_cast(Ty)) { unsigned Num = cast(VTy)->getNumElements(); unsigned Cost = thisT()->getArithmeticInstrCost( - Opcode, VTy->getScalarType(), CostKind); + Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo, Args, CxtI); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. return getScalarizationOverhead(VTy, Args) + Num * Cost; @@ -681,19 +730,20 @@ public: case TTI::SK_PermuteTwoSrc: return getPermuteShuffleOverhead(cast(Tp)); case TTI::SK_ExtractSubvector: - return getExtractSubvectorOverhead(cast(Tp), Index, + return getExtractSubvectorOverhead(Tp, Index, cast(SubTp)); case TTI::SK_InsertSubvector: - return getInsertSubvectorOverhead(cast(Tp), Index, + return getInsertSubvectorOverhead(Tp, Index, cast(SubTp)); } llvm_unreachable("Unknown TTI::ShuffleKind"); } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { - if (BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I) == 0) + if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) return 0; const TargetLoweringBase *TLI = getTLI(); @@ -731,15 +781,12 @@ public: return 0; LLVM_FALLTHROUGH; case Instruction::SExt: - if (!I) - break; - - if (getTLI()->isExtFree(I)) + if (I && getTLI()->isExtFree(I)) return 0; // If this is a zext/sext of a load, return 0 if the corresponding // extending load exists on target. 
- if (I && isa(I->getOperand(0))) { + if (CCH == TTI::CastContextHint::Normal) { EVT ExtVT = EVT::getEVT(Dst); EVT LoadVT = EVT::getEVT(Src); unsigned LType = @@ -814,7 +861,7 @@ public: unsigned SplitCost = (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; return SplitCost + - (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, + (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, CostKind, I)); } @@ -822,7 +869,7 @@ public: // the operation will get scalarized. unsigned Num = cast(DstVTy)->getNumElements(); unsigned Cost = thisT()->getCastInstrCost( - Opcode, Dst->getScalarType(), Src->getScalarType(), CostKind, I); + Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. @@ -847,7 +894,7 @@ public: return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, Index) + thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), - TTI::TCK_RecipThroughput); + TTI::CastContextHint::None, TTI::TCK_RecipThroughput); } unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { @@ -855,6 +902,7 @@ public: } unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { const TargetLoweringBase *TLI = getTLI(); @@ -863,7 +911,8 @@ public: // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, + I); // Selects on vectors are actually vector selects. 
if (ISD == ISD::SELECT) { @@ -888,7 +937,7 @@ public: if (CondTy) CondTy = CondTy->getScalarType(); unsigned Cost = thisT()->getCmpSelInstrCost( - Opcode, ValVTy->getScalarType(), CondTy, CostKind, I); + Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. @@ -922,7 +971,11 @@ public: return Cost; if (Src->isVectorTy() && - Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) { + // In practice it's not currently possible to have a change in lane + // length for extending loads or truncating stores so both types should + // have the same scalable property. + TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), + LT.second.getSizeInBits())) { // This is a vector load that legalizes to a larger type than the vector // itself. Unless the corresponding extending load or truncating store is // legal, then this will scalarize. @@ -945,6 +998,51 @@ public: return Cost; } + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) { + auto *VT = cast(DataTy); + // Assume the target does not have support for gather/scatter operations + // and provide a rough estimate. + // + // First, compute the cost of extracting the individual addresses and the + // individual memory operations. + int LoadCost = + VT->getNumElements() * + (getVectorInstrCost( + Instruction::ExtractElement, + FixedVectorType::get(PointerType::get(VT->getElementType(), 0), + VT->getNumElements()), + -1) + + getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); + + // Next, compute the cost of packing the result in a vector. 
+ int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store, + Opcode == Instruction::Store); + + int ConditionalCost = 0; + if (VariableMask) { + // Compute the cost of conditionally executing the memory operations with + // variable masks. This includes extracting the individual conditions, a + // branches and PHIs to combine the results. + // NOTE: Estimating the cost of conditionally executing the memory + // operations accurately is quite difficult and the current solution + // provides a very rough estimate only. + ConditionalCost = + VT->getNumElements() * + (getVectorInstrCost( + Instruction::ExtractElement, + FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), + VT->getNumElements()), + -1) + + getCFInstrCost(Instruction::Br, CostKind) + + getCFInstrCost(Instruction::PHI, CostKind)); + } + + return LoadCost + PackingCost + ConditionalCost; + } + unsigned getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -1099,76 +1197,52 @@ public: /// Get intrinsic cost based on arguments. unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { - Intrinsic::ID IID = ICA.getID(); - - // Special case some scalar intrinsics. - if (CostKind != TTI::TCK_RecipThroughput) { - switch (IID) { - default: - break; - case Intrinsic::cttz: - if (getTLI()->isCheapToSpeculateCttz()) - return TargetTransformInfo::TCC_Basic; - break; - case Intrinsic::ctlz: - if (getTLI()->isCheapToSpeculateCtlz()) - return TargetTransformInfo::TCC_Basic; - break; - case Intrinsic::memcpy: - return thisT()->getMemcpyCost(ICA.getInst()); - // TODO: other libc intrinsics. - } - return BaseT::getIntrinsicInstrCost(ICA, CostKind); - } - + // Check for generically free intrinsics. if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) return 0; - // TODO: Combine these two logic paths. 
+ // Assume that target intrinsics are cheap. + Intrinsic::ID IID = ICA.getID(); + if (Function::isTargetIntrinsic(IID)) + return TargetTransformInfo::TCC_Basic; + if (ICA.isTypeBasedOnly()) return getTypeBasedIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); - unsigned VF = ICA.getVectorFactor(); - unsigned RetVF = - (RetTy->isVectorTy() ? cast(RetTy)->getNumElements() - : 1); - assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); + + ElementCount VF = ICA.getVectorFactor(); + ElementCount RetVF = + (RetTy->isVectorTy() ? cast(RetTy)->getElementCount() + : ElementCount::getFixed(1)); + assert((RetVF.isScalar() || VF.isScalar()) && + "VF > 1 and RetVF is a vector type"); const IntrinsicInst *I = ICA.getInst(); const SmallVectorImpl &Args = ICA.getArgs(); FastMathFlags FMF = ICA.getFlags(); - switch (IID) { - default: { - // Assume that we need to scalarize this intrinsic. - SmallVector Types; - for (const Value *Op : Args) { - Type *OpTy = Op->getType(); - assert(VF == 1 || !OpTy->isVectorTy()); - Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF)); - } + default: + break; - if (VF > 1 && !RetTy->isVoidTy()) - RetTy = FixedVectorType::get(RetTy, VF); - - // Compute the scalarization overhead based on Args for a vector - // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while - // CostModel will pass a vector RetTy and VF is 1. - unsigned ScalarizationCost = std::numeric_limits::max(); - if (RetVF > 1 || VF > 1) { - ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += - getScalarizationOverhead(cast(RetTy), true, false); - ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); - } + case Intrinsic::cttz: + // FIXME: If necessary, this should go in target-specific overrides. 
+ if (VF.isScalar() && RetVF.isScalar() && + getTLI()->isCheapToSpeculateCttz()) + return TargetTransformInfo::TCC_Basic; + break; + + case Intrinsic::ctlz: + // FIXME: If necessary, this should go in target-specific overrides. + if (VF.isScalar() && RetVF.isScalar() && + getTLI()->isCheapToSpeculateCtlz()) + return TargetTransformInfo::TCC_Basic; + break; + + case Intrinsic::memcpy: + return thisT()->getMemcpyCost(ICA.getInst()); - IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, - ScalarizationCost, I); - return thisT()->getIntrinsicInstrCost(Attrs, CostKind); - } case Intrinsic::masked_scatter: { - assert(VF == 1 && "Can't vectorize types here."); + assert(VF.isScalar() && "Can't vectorize types here."); const Value *Mask = Args[3]; bool VarMask = !isa(Mask); Align Alignment = cast(Args[2])->getAlignValue(); @@ -1177,31 +1251,57 @@ public: VarMask, Alignment, CostKind, I); } case Intrinsic::masked_gather: { - assert(VF == 1 && "Can't vectorize types here."); + assert(VF.isScalar() && "Can't vectorize types here."); const Value *Mask = Args[2]; bool VarMask = !isa(Mask); Align Alignment = cast(Args[1])->getAlignValue(); return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], VarMask, Alignment, CostKind, I); } - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: { + case Intrinsic::experimental_vector_extract: { + // FIXME: Handle case where 
a scalable vector is extracted from a scalable + // vector + if (isa(RetTy)) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + unsigned Index = cast(Args[1])->getZExtValue(); + return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, + cast(Args[0]->getType()), + Index, cast(RetTy)); + } + case Intrinsic::experimental_vector_insert: { + // FIXME: Handle case where a scalable vector is inserted into a scalable + // vector + if (isa(Args[1]->getType())) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + unsigned Index = cast(Args[2])->getZExtValue(); + return thisT()->getShuffleCost( + TTI::SK_InsertSubvector, cast(Args[0]->getType()), Index, + cast(Args[1]->getType())); + } + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: { IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); - return getIntrinsicInstrCost(Attrs, CostKind); + return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); + } + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: { + IntrinsicCostAttributes Attrs( + IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I); + return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } case Intrinsic::fshl: case Intrinsic::fshr: { + if (isa(RetTy)) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); const Value *X = Args[0]; const Value *Y = Args[1]; const Value *Z = Args[2]; @@ -1232,14 +1332,48 @@ public: // For non-rotates (X != Y) we must add shift-by-zero handling costs. 
if (X != Y) { Type *CondTy = RetTy->getWithNewBitWidth(1); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, - CondTy, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); } return Cost; } } + // TODO: Handle the remaining intrinsic with scalable vector type + if (isa(RetTy)) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + + // Assume that we need to scalarize this intrinsic. + SmallVector Types; + for (const Value *Op : Args) { + Type *OpTy = Op->getType(); + assert(VF.isScalar() || !OpTy->isVectorTy()); + Types.push_back(VF.isScalar() + ? OpTy + : FixedVectorType::get(OpTy, VF.getKnownMinValue())); + } + + if (VF.isVector() && !RetTy->isVoidTy()) + RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue()); + + // Compute the scalarization overhead based on Args for a vector + // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while + // CostModel will pass a vector RetTy and VF is 1. + unsigned ScalarizationCost = std::numeric_limits::max(); + if (RetVF.isVector() || VF.isVector()) { + ScalarizationCost = 0; + if (!RetTy->isVoidTy()) + ScalarizationCost += + getScalarizationOverhead(cast(RetTy), true, false); + ScalarizationCost += + getOperandsScalarizationOverhead(Args, VF.getKnownMinValue()); + } + + IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I); + return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } /// Get intrinsic cost based on argument types. @@ -1255,10 +1389,21 @@ public: unsigned ScalarizationCostPassed = ICA.getScalarizationCost(); bool SkipScalarizationCost = ICA.skipScalarizationCost(); - auto *VecOpTy = Tys.empty() ? 
nullptr : dyn_cast(Tys[0]); + VectorType *VecOpTy = nullptr; + if (!Tys.empty()) { + // The vector reduction operand is operand 0 except for fadd/fmul. + // Their operand 0 is a scalar start value, so the vector op is operand 1. + unsigned VecTyIndex = 0; + if (IID == Intrinsic::vector_reduce_fadd || + IID == Intrinsic::vector_reduce_fmul) + VecTyIndex = 1; + assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); + VecOpTy = dyn_cast(Tys[VecTyIndex]); + } + // Library call cost - other than size, make it expensive. + unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; SmallVector ISDs; - unsigned SingleCallCost = 10; // Library call cost. Make it expensive. switch (IID) { default: { // Assume that we need to scalarize this intrinsic. @@ -1327,13 +1472,15 @@ public: break; case Intrinsic::minnum: ISDs.push_back(ISD::FMINNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMINIMUM); break; case Intrinsic::maxnum: ISDs.push_back(ISD::FMAXNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMAXIMUM); + break; + case Intrinsic::minimum: + ISDs.push_back(ISD::FMINIMUM); + break; + case Intrinsic::maximum: + ISDs.push_back(ISD::FMAXIMUM); break; case Intrinsic::copysign: ISDs.push_back(ISD::FCOPYSIGN); @@ -1375,6 +1522,7 @@ public: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: return 0; case Intrinsic::masked_store: { Type *Ty = Tys[0]; @@ -1388,50 +1536,72 @@ public: return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, CostKind); } - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case 
Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, /*IsUnsigned=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, /*IsUnsigned=*/true, CostKind); + case Intrinsic::abs: + case Intrinsic::smax: + case Intrinsic::smin: + case 
Intrinsic::umax: + case Intrinsic::umin: { + // abs(X) = select(icmp(X,0),X,sub(0,X)) + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + unsigned Cost = 0; + // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code. + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + // TODO: Should we add an OperandValueProperties::OP_Zero property? + if (IID == Intrinsic::abs) + Cost += thisT()->getArithmeticInstrCost( + BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { Type *CondTy = RetTy->getWithNewBitWidth(1); @@ -1447,10 +1617,12 @@ public: IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed); Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, - CondTy, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost( + BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } case Intrinsic::uadd_sat: @@ -1466,8 +1638,9 @@ public: IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed); Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } case Intrinsic::smul_fix: @@ -1477,13 +1650,14 @@ public: unsigned ExtOp = IID == 
Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; unsigned Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CostKind); + CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); @@ -1511,10 +1685,12 @@ public: // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) unsigned Cost = 0; Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += 3 * thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, - OverflowTy, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, OverflowTy, - OverflowTy, CostKind); + Cost += 3 * thisT()->getCmpSelInstrCost( + Instruction::ICmp, SumTy, OverflowTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost( + Instruction::Select, OverflowTy, OverflowTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy, CostKind); return Cost; @@ -1529,8 +1705,9 @@ public: unsigned Cost = 0; Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, - OverflowTy, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } case Intrinsic::smul_with_overflow: @@ -1542,13 +1719,14 @@ public: unsigned ExtOp = IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; unsigned Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CostKind); + CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); @@ -1558,8 +1736,9 @@ public: CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, - OverflowTy, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } case Intrinsic::ctpop: @@ -1568,7 +1747,12 @@ public: // library call but still not a cheap instruction. SingleCallCost = TargetTransformInfo::TCC_Expensive; break; - // FIXME: ctlz, cttz, ... 
+ case Intrinsic::ctlz: + ISDs.push_back(ISD::CTLZ); + break; + case Intrinsic::cttz: + ISDs.push_back(ISD::CTTZ); + break; case Intrinsic::bswap: ISDs.push_back(ISD::BSWAP); break; @@ -1604,7 +1788,7 @@ public: } } - auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); + auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); if (MinLegalCostI != LegalCost.end()) return *MinLegalCostI; @@ -1801,9 +1985,10 @@ public: (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); MinMaxCost += - thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, CostKind) + + thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind) + thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, - CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); Ty = SubTy; ++LongVectorCount; } @@ -1825,15 +2010,37 @@ public: thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); MinMaxCost += NumReduxLevels * - (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) + + (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind) + thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, - CostKind)); + CmpInst::BAD_ICMP_PREDICATE, CostKind)); // The last min/max should be in vector registers and we counted it above. // So just need a single extractelement. 
return ShuffleCost + MinMaxCost + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } + InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, + Type *ResTy, VectorType *Ty, + TTI::TargetCostKind CostKind) { + // Without any native support, this is equivalent to the cost of + // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) + VectorType *ExtTy = VectorType::get(ResTy, Ty); + unsigned RedCost = thisT()->getArithmeticReductionCost( + Instruction::Add, ExtTy, false, CostKind); + unsigned MulCost = 0; + unsigned ExtCost = thisT()->getCastInstrCost( + IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, + TTI::CastContextHint::None, CostKind); + if (IsMLA) { + MulCost = + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + ExtCost *= 2; + } + + return RedCost + MulCost + ExtCost; + } + unsigned getVectorSplitCost() { return 1; } /// @} diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/CalcSpillWeights.h index 9b8b7324f30a..78dae81f596e 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/CalcSpillWeights.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/CalcSpillWeights.h @@ -44,64 +44,60 @@ class VirtRegMap; /// Calculate auxiliary information for a virtual register such as its /// spill weight and allocation hint. 
class VirtRegAuxInfo { - public: - using NormalizingFn = float (*)(float, unsigned, unsigned); - - private: MachineFunction &MF; LiveIntervals &LIS; - VirtRegMap *VRM; + const VirtRegMap &VRM; const MachineLoopInfo &Loops; const MachineBlockFrequencyInfo &MBFI; - DenseMap Hint; - NormalizingFn normalize; public: - VirtRegAuxInfo(MachineFunction &mf, LiveIntervals &lis, - VirtRegMap *vrm, const MachineLoopInfo &loops, - const MachineBlockFrequencyInfo &mbfi, - NormalizingFn norm = normalizeSpillWeight) - : MF(mf), LIS(lis), VRM(vrm), Loops(loops), MBFI(mbfi), normalize(norm) {} + VirtRegAuxInfo(MachineFunction &MF, LiveIntervals &LIS, + const VirtRegMap &VRM, const MachineLoopInfo &Loops, + const MachineBlockFrequencyInfo &MBFI) + : MF(MF), LIS(LIS), VRM(VRM), Loops(Loops), MBFI(MBFI) {} + + virtual ~VirtRegAuxInfo() = default; /// (re)compute li's spill weight and allocation hint. - void calculateSpillWeightAndHint(LiveInterval &li); + void calculateSpillWeightAndHint(LiveInterval &LI); - /// Compute future expected spill weight of a split artifact of li + /// Compute future expected spill weight of a split artifact of LI /// that will span between start and end slot indexes. - /// \param li The live interval to be split. - /// \param start The expected begining of the split artifact. Instructions + /// \param LI The live interval to be split. + /// \param Start The expected beginning of the split artifact. Instructions /// before start will not affect the weight. - /// \param end The expected end of the split artifact. Instructions + /// \param End The expected end of the split artifact. Instructions /// after end will not affect the weight. /// \return The expected spill weight of the split artifact. Returns - /// negative weight for unspillable li. - float futureWeight(LiveInterval &li, SlotIndex start, SlotIndex end); + /// negative weight for unspillable LI. 
+ float futureWeight(LiveInterval &LI, SlotIndex Start, SlotIndex End); + + /// Compute spill weights and allocation hints for all virtual register + /// live intervals. + void calculateSpillWeightsAndHints(); + protected: /// Helper function for weight calculations. - /// (Re)compute li's spill weight and allocation hint, or, for non null + /// (Re)compute LI's spill weight and allocation hint, or, for non null /// start and end - compute future expected spill weight of a split - /// artifact of li that will span between start and end slot indexes. - /// \param li The live interval for which to compute the weight. - /// \param start The expected begining of the split artifact. Instructions + /// artifact of LI that will span between start and end slot indexes. + /// \param LI The live interval for which to compute the weight. + /// \param Start The expected beginning of the split artifact. Instructions /// before start will not affect the weight. Relevant for /// weight calculation of future split artifact. - /// \param end The expected end of the split artifact. Instructions + /// \param End The expected end of the split artifact. Instructions /// after end will not affect the weight. Relevant for /// weight calculation of future split artifact. - /// \return The spill weight. Returns negative weight for unspillable li. - float weightCalcHelper(LiveInterval &li, SlotIndex *start = nullptr, - SlotIndex *end = nullptr); - }; - - /// Compute spill weights and allocation hints for all virtual register - /// live intervals. - void calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF, - VirtRegMap *VRM, - const MachineLoopInfo &MLI, - const MachineBlockFrequencyInfo &MBFI, - VirtRegAuxInfo::NormalizingFn norm = - normalizeSpillWeight); + /// \return The spill weight. Returns negative weight for unspillable LI. + float weightCalcHelper(LiveInterval &LI, SlotIndex *Start = nullptr, + SlotIndex *End = nullptr); + /// Weight normalization function. 
+ virtual float normalize(float UseDefFreq, unsigned Size, + unsigned NumInstr) { + return normalizeSpillWeight(UseDefFreq, Size, NumInstr); + } + }; } // end namespace llvm #endif // LLVM_CODEGEN_CALCSPILLWEIGHTS_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/CallingConvLower.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/CallingConvLower.h index 8ebe788ac360..2fe4e371263b 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCRegisterInfo.h" @@ -25,6 +25,7 @@ namespace llvm { class CCState; +class MachineFunction; class MVT; class TargetRegisterInfo; @@ -339,6 +340,11 @@ public: return Regs.size(); } + void DeallocateReg(MCPhysReg Reg) { + assert(isAllocated(Reg) && "Trying to deallocate an unallocated register"); + MarkUnallocated(Reg); + } + /// AllocateReg - Attempt to allocate one register. If it is not available, /// return zero. Otherwise, return the register, marking it and any aliases /// as allocated. @@ -432,10 +438,7 @@ public: return AllocateStack(Size, Align(Alignment)); } - void ensureMaxAlignment(Align Alignment) { - if (!AnalyzingMustTailForwardedRegs) - MF.getFrameInfo().ensureMaxAlignment(Alignment); - } + void ensureMaxAlignment(Align Alignment); /// Version of AllocateStack with extra register to be shadowed. LLVM_ATTRIBUTE_DEPRECATED(unsigned AllocateStack(unsigned Size, @@ -572,6 +575,8 @@ public: private: /// MarkAllocated - Mark a register and all of its aliases as allocated. 
void MarkAllocated(MCPhysReg Reg); + + void MarkUnallocated(MCPhysReg Reg); }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h new file mode 100644 index 000000000000..893bc6e013f4 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -0,0 +1,1144 @@ +//===- Construction of codegen pass pipelines ------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Interfaces for registering analysis passes, producing common pass manager +/// configurations, and parsing of pass pipelines. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_CODEGENPASSBUILDER_H +#define LLVM_CODEGEN_CODEGENPASSBUILDER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAndersAliasAnalysis.h" +#include "llvm/Analysis/CFLSteensAliasAnalysis.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/CodeGen/ExpandReductions.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/UnreachableBlockElim.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCTargetOptions.h" 
+#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/CGPassBuilderOption.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ConstantHoisting.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" +#include "llvm/Transforms/Scalar/MergeICmps.h" +#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" +#include "llvm/Transforms/Scalar/ScalarizeMaskedMemIntrin.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/EntryExitInstrumenter.h" +#include "llvm/Transforms/Utils/LowerInvoke.h" +#include +#include +#include +#include + +namespace llvm { + +// FIXME: Dummy target independent passes definitions that have not yet been +// ported to new pass manager. Once they do, remove these. +#define DUMMY_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + struct PASS_NAME : public PassInfoMixin { \ + template PASS_NAME(Ts &&...) {} \ + PreservedAnalyses run(Function &, FunctionAnalysisManager &) { \ + return PreservedAnalyses::all(); \ + } \ + }; +#define DUMMY_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + struct PASS_NAME : public PassInfoMixin { \ + template PASS_NAME(Ts &&...) {} \ + PreservedAnalyses run(Module &, ModuleAnalysisManager &) { \ + return PreservedAnalyses::all(); \ + } \ + }; +#define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + struct PASS_NAME : public PassInfoMixin { \ + template PASS_NAME(Ts &&...) 
{} \ + Error run(Module &, MachineFunctionAnalysisManager &) { \ + return Error::success(); \ + } \ + PreservedAnalyses run(MachineFunction &, \ + MachineFunctionAnalysisManager &) { \ + llvm_unreachable("this api is to make new PM api happy"); \ + } \ + static AnalysisKey Key; \ + }; +#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + struct PASS_NAME : public PassInfoMixin { \ + template PASS_NAME(Ts &&...) {} \ + PreservedAnalyses run(MachineFunction &, \ + MachineFunctionAnalysisManager &) { \ + return PreservedAnalyses::all(); \ + } \ + static AnalysisKey Key; \ + }; +#include "MachinePassRegistry.def" + +/// This class provides access to building LLVM's passes. +/// +/// Its members provide the baseline state available to passes during their +/// construction. The \c MachinePassRegistry.def file specifies how to construct +/// all of the built-in passes, and those may reference these members during +/// construction. +template class CodeGenPassBuilder { +public: + explicit CodeGenPassBuilder(LLVMTargetMachine &TM, CGPassBuilderOption Opts, + PassInstrumentationCallbacks *PIC) + : TM(TM), Opt(Opts), PIC(PIC) { + // Target could set CGPassBuilderOption::MISchedPostRA to true to achieve + // substitutePass(&PostRASchedulerID, &PostMachineSchedulerID) + + // Target should override TM.Options.EnableIPRA in their target-specific + // LLVMTM ctor. See TargetMachine::setGlobalISel for example. 
+ if (Opt.EnableIPRA) + TM.Options.EnableIPRA = *Opt.EnableIPRA; + + if (Opt.EnableGlobalISelAbort) + TM.Options.GlobalISelAbort = *Opt.EnableGlobalISelAbort; + + if (!Opt.OptimizeRegAlloc) + Opt.OptimizeRegAlloc = getOptLevel() != CodeGenOpt::None; + } + + Error buildPipeline(ModulePassManager &MPM, MachineFunctionPassManager &MFPM, + raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType) const; + + void registerModuleAnalyses(ModuleAnalysisManager &) const; + void registerFunctionAnalyses(FunctionAnalysisManager &) const; + void registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &) const; + std::pair getPassNameFromLegacyName(StringRef) const; + + void registerAnalyses(MachineFunctionAnalysisManager &MFAM) const { + registerModuleAnalyses(*MFAM.MAM); + registerFunctionAnalyses(*MFAM.FAM); + registerMachineFunctionAnalyses(MFAM); + } + + PassInstrumentationCallbacks *getPassInstrumentationCallbacks() const { + return PIC; + } + +protected: + template using has_key_t = decltype(PassT::Key); + + template + using is_module_pass_t = decltype(std::declval().run( + std::declval(), std::declval())); + + template + using is_function_pass_t = decltype(std::declval().run( + std::declval(), std::declval())); + + // Function object to maintain state while adding codegen IR passes. 
+ class AddIRPass { + public: + AddIRPass(ModulePassManager &MPM, bool DebugPM, bool Check = true) + : MPM(MPM), FPM(DebugPM) { + if (Check) + AddingFunctionPasses = false; + } + ~AddIRPass() { + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + // Add Function Pass + template + std::enable_if_t::value> + operator()(PassT &&Pass) { + if (AddingFunctionPasses && !*AddingFunctionPasses) + AddingFunctionPasses = true; + FPM.addPass(std::forward(Pass)); + } + + // Add Module Pass + template + std::enable_if_t::value && + !is_detected::value> + operator()(PassT &&Pass) { + assert((!AddingFunctionPasses || !*AddingFunctionPasses) && + "could not add module pass after adding function pass"); + MPM.addPass(std::forward(Pass)); + } + + private: + ModulePassManager &MPM; + FunctionPassManager FPM; + // The codegen IR pipeline are mostly function passes with the exceptions of + // a few loop and module passes. `AddingFunctionPasses` make sures that + // we could only add module passes at the beginning of the pipeline. Once + // we begin adding function passes, we could no longer add module passes. + // This special-casing introduces less adaptor passes. If we have the need + // of adding module passes after function passes, we could change the + // implementation to accommodate that. + Optional AddingFunctionPasses; + }; + + // Function object to maintain state while adding codegen machine passes. 
+ class AddMachinePass { + public: + AddMachinePass(MachineFunctionPassManager &PM) : PM(PM) {} + + template void operator()(PassT &&Pass) { + static_assert( + is_detected::value, + "Machine function pass must define a static member variable `Key`."); + for (auto &C : BeforeCallbacks) + if (!C(&PassT::Key)) + return; + PM.addPass(std::forward(Pass)); + for (auto &C : AfterCallbacks) + C(&PassT::Key); + } + + template void insertPass(AnalysisKey *ID, PassT Pass) { + AfterCallbacks.emplace_back( + [this, ID, Pass = std::move(Pass)](AnalysisKey *PassID) { + if (PassID == ID) + this->PM.addPass(std::move(Pass)); + }); + } + + void disablePass(AnalysisKey *ID) { + BeforeCallbacks.emplace_back( + [ID](AnalysisKey *PassID) { return PassID != ID; }); + } + + MachineFunctionPassManager releasePM() { return std::move(PM); } + + private: + MachineFunctionPassManager &PM; + SmallVector, 4> BeforeCallbacks; + SmallVector, 4> AfterCallbacks; + }; + + LLVMTargetMachine &TM; + CGPassBuilderOption Opt; + PassInstrumentationCallbacks *PIC; + + /// Target override these hooks to parse target-specific analyses. + void registerTargetAnalysis(ModuleAnalysisManager &) const {} + void registerTargetAnalysis(FunctionAnalysisManager &) const {} + void registerTargetAnalysis(MachineFunctionAnalysisManager &) const {} + std::pair getTargetPassNameFromLegacyName(StringRef) const { + return {"", false}; + } + + template TMC &getTM() const { return static_cast(TM); } + CodeGenOpt::Level getOptLevel() const { return TM.getOptLevel(); } + + /// Check whether or not GlobalISel should abort on error. + /// When this is disabled, GlobalISel will fall back on SDISel instead of + /// erroring out. + bool isGlobalISelAbortEnabled() const { + return TM.Options.GlobalISelAbort == GlobalISelAbortMode::Enable; + } + + /// Check whether or not a diagnostic should be emitted when GlobalISel + /// uses the fallback path. 
In other words, it will emit a diagnostic + /// when GlobalISel failed and isGlobalISelAbortEnabled is false. + bool reportDiagnosticWhenGlobalISelFallback() const { + return TM.Options.GlobalISelAbort == GlobalISelAbortMode::DisableWithDiag; + } + + /// addInstSelector - This method should install an instruction selector pass, + /// which converts from LLVM code to machine instructions. + Error addInstSelector(AddMachinePass &) const { + return make_error("addInstSelector is not overridden", + inconvertibleErrorCode()); + } + + /// Add passes that optimize instruction level parallelism for out-of-order + /// targets. These passes are run while the machine code is still in SSA + /// form, so they can use MachineTraceMetrics to control their heuristics. + /// + /// All passes added here should preserve the MachineDominatorTree, + /// MachineLoopInfo, and MachineTraceMetrics analyses. + void addILPOpts(AddMachinePass &) const {} + + /// This method may be implemented by targets that want to run passes + /// immediately before register allocation. + void addPreRegAlloc(AddMachinePass &) const {} + + /// addPreRewrite - Add passes to the optimized register allocation pipeline + /// after register allocation is complete, but before virtual registers are + /// rewritten to physical registers. + /// + /// These passes must preserve VirtRegMap and LiveIntervals, and when running + /// after RABasic or RAGreedy, they should take advantage of LiveRegMatrix. + /// When these passes run, VirtRegMap contains legal physreg assignments for + /// all virtual registers. + /// + /// Note if the target overloads addRegAssignAndRewriteOptimized, this may not + /// be honored. This is also not generally used for the the fast variant, + /// where the allocation and rewriting are done in one pass. + void addPreRewrite(AddMachinePass &) const {} + + /// Add passes to be run immediately after virtual registers are rewritten + /// to physical registers. 
+ void addPostRewrite(AddMachinePass &) const {} + + /// This method may be implemented by targets that want to run passes after + /// register allocation pass pipeline but before prolog-epilog insertion. + void addPostRegAlloc(AddMachinePass &) const {} + + /// This method may be implemented by targets that want to run passes after + /// prolog-epilog insertion and before the second instruction scheduling pass. + void addPreSched2(AddMachinePass &) const {} + + /// This pass may be implemented by targets that want to run passes + /// immediately before machine code is emitted. + void addPreEmitPass(AddMachinePass &) const {} + + /// Targets may add passes immediately before machine code is emitted in this + /// callback. This is called even later than `addPreEmitPass`. + // FIXME: Rename `addPreEmitPass` to something more sensible given its actual + // position and remove the `2` suffix here as this callback is what + // `addPreEmitPass` *should* be but in reality isn't. + void addPreEmitPass2(AddMachinePass &) const {} + + /// {{@ For GlobalISel + /// + + /// addPreISel - This method should add any "last minute" LLVM->LLVM + /// passes (which are run just before instruction selector). + void addPreISel(AddIRPass &) const { + llvm_unreachable("addPreISel is not overridden"); + } + + /// This method should install an IR translator pass, which converts from + /// LLVM code to machine instructions with possibly generic opcodes. + Error addIRTranslator(AddMachinePass &) const { + return make_error("addIRTranslator is not overridden", + inconvertibleErrorCode()); + } + + /// This method may be implemented by targets that want to run passes + /// immediately before legalization. + void addPreLegalizeMachineIR(AddMachinePass &) const {} + + /// This method should install a legalize pass, which converts the instruction + /// sequence into one that can be selected by the target. 
+ Error addLegalizeMachineIR(AddMachinePass &) const { + return make_error("addLegalizeMachineIR is not overridden", + inconvertibleErrorCode()); + } + + /// This method may be implemented by targets that want to run passes + /// immediately before the register bank selection. + void addPreRegBankSelect(AddMachinePass &) const {} + + /// This method should install a register bank selector pass, which + /// assigns register banks to virtual registers without a register + /// class or register banks. + Error addRegBankSelect(AddMachinePass &) const { + return make_error("addRegBankSelect is not overridden", + inconvertibleErrorCode()); + } + + /// This method may be implemented by targets that want to run passes + /// immediately before the (global) instruction selection. + void addPreGlobalInstructionSelect(AddMachinePass &) const {} + + /// This method should install a (global) instruction selector pass, which + /// converts possibly generic instructions to fully target-specific + /// instructions, thereby constraining all generic virtual registers to + /// register classes. + Error addGlobalInstructionSelect(AddMachinePass &) const { + return make_error( + "addGlobalInstructionSelect is not overridden", + inconvertibleErrorCode()); + } + /// @}} + + /// High level function that adds all passes necessary to go from llvm IR + /// representation to the MI representation. + /// Adds IR based lowering and target specific optimization passes and finally + /// the core instruction selection passes. + /// \returns true if an error occurred, false otherwise. + void addISelPasses(AddIRPass &) const; + + /// Add the actual instruction selection passes. This does not include + /// preparation passes on IR. + Error addCoreISelPasses(AddMachinePass &) const; + + /// Add the complete, standard set of LLVM CodeGen passes. + /// Fully developed targets will not generally override this. 
+ Error addMachinePasses(AddMachinePass &) const; + + /// Add passes to lower exception handling for the code generator. + void addPassesToHandleExceptions(AddIRPass &) const; + + /// Add common target configurable passes that perform LLVM IR to IR + /// transforms following machine independent optimization. + void addIRPasses(AddIRPass &) const; + + /// Add pass to prepare the LLVM IR for code generation. This should be done + /// before exception handling preparation passes. + void addCodeGenPrepare(AddIRPass &) const; + + /// Add common passes that perform LLVM IR to IR transforms in preparation for + /// instruction selection. + void addISelPrepare(AddIRPass &) const; + + /// Methods with trivial inline returns are convenient points in the common + /// codegen pass pipeline where targets may insert passes. Methods with + /// out-of-line standard implementations are major CodeGen stages called by + /// addMachinePasses. Some targets may override major stages when inserting + /// passes is insufficient, but maintaining overriden stages is more work. + /// + + /// addMachineSSAOptimization - Add standard passes that optimize machine + /// instructions in SSA form. + void addMachineSSAOptimization(AddMachinePass &) const; + + /// addFastRegAlloc - Add the minimum set of target-independent passes that + /// are required for fast register allocation. + Error addFastRegAlloc(AddMachinePass &) const; + + /// addOptimizedRegAlloc - Add passes related to register allocation. + /// LLVMTargetMachine provides standard regalloc passes for most targets. + void addOptimizedRegAlloc(AddMachinePass &) const; + + /// Add passes that optimize machine instructions after register allocation. + void addMachineLateOptimization(AddMachinePass &) const; + + /// addGCPasses - Add late codegen passes that analyze code for garbage + /// collection. This should return true if GC info should be printed after + /// these passes. 
+ void addGCPasses(AddMachinePass &) const {} + + /// Add standard basic block placement passes. + void addBlockPlacement(AddMachinePass &) const; + + using CreateMCStreamer = + std::function>(MCContext &)>; + void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const { + llvm_unreachable("addAsmPrinter is not overridden"); + } + + /// Utilities for targets to add passes to the pass manager. + /// + + /// createTargetRegisterAllocator - Create the register allocator pass for + /// this target at the current optimization level. + void addTargetRegisterAllocator(AddMachinePass &, bool Optimized) const; + + /// addMachinePasses helper to create the target-selected or overriden + /// regalloc pass. + void addRegAllocPass(AddMachinePass &, bool Optimized) const; + + /// Add core register alloator passes which do the actual register assignment + /// and rewriting. \returns true if any passes were added. + Error addRegAssignmentFast(AddMachinePass &) const; + Error addRegAssignmentOptimized(AddMachinePass &) const; + +private: + DerivedT &derived() { return static_cast(*this); } + const DerivedT &derived() const { + return static_cast(*this); + } +}; + +template +Error CodeGenPassBuilder::buildPipeline( + ModulePassManager &MPM, MachineFunctionPassManager &MFPM, + raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType) const { + AddIRPass addIRPass(MPM, Opt.DebugPM); + addISelPasses(addIRPass); + + AddMachinePass addPass(MFPM); + if (auto Err = addCoreISelPasses(addPass)) + return std::move(Err); + + if (auto Err = derived().addMachinePasses(addPass)) + return std::move(Err); + + derived().addAsmPrinter( + addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { + return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx); + }); + + addPass(FreeMachineFunctionPass()); + return Error::success(); +} + +static inline AAManager registerAAAnalyses(CFLAAType UseCFLAA) { + AAManager AA; + + // The order in which these are registered determines their 
priority when + // being queried. + + switch (UseCFLAA) { + case CFLAAType::Steensgaard: + AA.registerFunctionAnalysis(); + break; + case CFLAAType::Andersen: + AA.registerFunctionAnalysis(); + break; + case CFLAAType::Both: + AA.registerFunctionAnalysis(); + AA.registerFunctionAnalysis(); + break; + default: + break; + } + + // Basic AliasAnalysis support. + // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that + // BasicAliasAnalysis wins if they disagree. This is intended to help + // support "obvious" type-punning idioms. + AA.registerFunctionAnalysis(); + AA.registerFunctionAnalysis(); + AA.registerFunctionAnalysis(); + + return AA; +} + +template +void CodeGenPassBuilder::registerModuleAnalyses( + ModuleAnalysisManager &MAM) const { +#define MODULE_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ + MAM.registerPass([&] { return PASS_NAME CONSTRUCTOR; }); +#include "MachinePassRegistry.def" + derived().registerTargetAnalysis(MAM); +} + +template +void CodeGenPassBuilder::registerFunctionAnalyses( + FunctionAnalysisManager &FAM) const { + FAM.registerPass([this] { return registerAAAnalyses(this->Opt.UseCFLAA); }); + +#define FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ + FAM.registerPass([&] { return PASS_NAME CONSTRUCTOR; }); +#include "MachinePassRegistry.def" + derived().registerTargetAnalysis(FAM); +} + +template +void CodeGenPassBuilder::registerMachineFunctionAnalyses( + MachineFunctionAnalysisManager &MFAM) const { +#define MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ + MFAM.registerPass([&] { return PASS_NAME CONSTRUCTOR; }); +#include "MachinePassRegistry.def" + derived().registerTargetAnalysis(MFAM); +} + +// FIXME: For new PM, use pass name directly in commandline seems good. +// Translate stringfied pass name to its old commandline name. Returns the +// matching legacy name and a boolean value indicating if the pass is a machine +// pass. 
+template +std::pair +CodeGenPassBuilder::getPassNameFromLegacyName(StringRef Name) const { + std::pair Ret; + if (Name.empty()) + return Ret; + +#define FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, false}; +#define DUMMY_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, false}; +#define MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, false}; +#define DUMMY_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, false}; +#define MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, true}; +#define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, true}; +#define MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, true}; +#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ + if (Name == NAME) \ + Ret = {#PASS_NAME, true}; +#include "llvm/CodeGen/MachinePassRegistry.def" + + if (Ret.first.empty()) + Ret = derived().getTargetPassNameFromLegacyName(Name); + + if (Ret.first.empty()) + report_fatal_error(Twine('\"') + Twine(Name) + + Twine("\" pass could not be found.")); + + return Ret; +} + +template +void CodeGenPassBuilder::addISelPasses(AddIRPass &addPass) const { + if (TM.useEmulatedTLS()) + addPass(LowerEmuTLSPass()); + + addPass(PreISelIntrinsicLoweringPass()); + + derived().addIRPasses(addPass); + derived().addCodeGenPrepare(addPass); + addPassesToHandleExceptions(addPass); + derived().addISelPrepare(addPass); +} + +/// Add common target configurable passes that perform LLVM IR to IR transforms +/// following machine independent optimization. +template +void CodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { + // Before running any passes, run the verifier to determine if the input + // coming from the front-end and/or optimizer is valid. 
+ if (!Opt.DisableVerify) + addPass(VerifierPass()); + + // Run loop strength reduction before anything else. + if (getOptLevel() != CodeGenOpt::None && !Opt.DisableLSR) { + addPass(createFunctionToLoopPassAdaptor( + LoopStrengthReducePass(), /*UseMemorySSA*/ true, Opt.DebugPM)); + // FIXME: use -stop-after so we could remove PrintLSR + if (Opt.PrintLSR) + addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); + } + + if (getOptLevel() != CodeGenOpt::None) { + // The MergeICmpsPass tries to create memcmp calls by grouping sequences of + // loads and compares. ExpandMemCmpPass then tries to expand those calls + // into optimally-sized loads and compares. The transforms are enabled by a + // target lowering hook. + if (!Opt.DisableMergeICmps) + addPass(MergeICmpsPass()); + addPass(ExpandMemCmpPass()); + } + + // Run GC lowering passes for builtin collectors + // TODO: add a pass insertion point here + addPass(GCLoweringPass()); + addPass(ShadowStackGCLoweringPass()); + addPass(LowerConstantIntrinsicsPass()); + + // Make sure that no unreachable blocks are instruction selected. + addPass(UnreachableBlockElimPass()); + + // Prepare expensive constants for SelectionDAG. + if (getOptLevel() != CodeGenOpt::None && !Opt.DisableConstantHoisting) + addPass(ConstantHoistingPass()); + + if (getOptLevel() != CodeGenOpt::None && !Opt.DisablePartialLibcallInlining) + addPass(PartiallyInlineLibCallsPass()); + + // Instrument function entry and exit, e.g. with calls to mcount(). + addPass(EntryExitInstrumenterPass(/*PostInlining=*/true)); + + // Add scalarization of target's unsupported masked memory intrinsics pass. + // the unsupported intrinsic will be replaced with a chain of basic blocks, + // that stores/loads element one-by-one if the appropriate mask bit is set. + addPass(ScalarizeMaskedMemIntrinPass()); + + // Expand reduction intrinsics into shuffle sequences if the target wants to. 
+ addPass(ExpandReductionsPass()); +} + +/// Turn exception handling constructs into something the code generators can +/// handle. +template +void CodeGenPassBuilder::addPassesToHandleExceptions( + AddIRPass &addPass) const { + const MCAsmInfo *MCAI = TM.getMCAsmInfo(); + assert(MCAI && "No MCAsmInfo"); + switch (MCAI->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both + // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise, + // catch info can get misplaced when a selector ends up more than one block + // removed from the parent invoke(s). This could happen when a landing + // pad is shared by multiple invokes and is also a target of a normal + // edge from elsewhere. + addPass(SjLjEHPreparePass()); + LLVM_FALLTHROUGH; + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + case ExceptionHandling::AIX: + addPass(DwarfEHPass(getOptLevel())); + break; + case ExceptionHandling::WinEH: + // We support using both GCC-style and MSVC-style exceptions on Windows, so + // add both preparation passes. Each pass will only actually run if it + // recognizes the personality function. + addPass(WinEHPass()); + addPass(DwarfEHPass(getOptLevel())); + break; + case ExceptionHandling::Wasm: + // Wasm EH uses Windows EH instructions, but it does not need to demote PHIs + // on catchpads and cleanuppads because it does not outline them into + // funclets. Catchswitch blocks are not lowered in SelectionDAG, so we + // should remove PHIs there. + addPass(WinEHPass(/*DemoteCatchSwitchPHIOnly=*/false)); + addPass(WasmEHPass()); + break; + case ExceptionHandling::None: + addPass(LowerInvokePass()); + + // The lower invoke pass may create unreachable code. Remove it. + addPass(UnreachableBlockElimPass()); + break; + } +} + +/// Add pass to prepare the LLVM IR for code generation. This should be done +/// before exception handling preparation passes. 
+template +void CodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { + if (getOptLevel() != CodeGenOpt::None && !Opt.DisableCGP) + addPass(CodeGenPreparePass()); + // TODO: Default ctor'd RewriteSymbolPass is no-op. + // addPass(RewriteSymbolPass()); +} + +/// Add common passes that perform LLVM IR to IR transforms in preparation for +/// instruction selection. +template +void CodeGenPassBuilder::addISelPrepare(AddIRPass &addPass) const { + derived().addPreISel(addPass); + + // Add both the safe stack and the stack protection passes: each of them will + // only protect functions that have corresponding attributes. + addPass(SafeStackPass()); + addPass(StackProtectorPass()); + + if (Opt.PrintISelInput) + addPass(PrintFunctionPass(dbgs(), + "\n\n*** Final LLVM Code input to ISel ***\n")); + + // All passes which modify the LLVM IR are now complete; run the verifier + // to ensure that the IR is valid. + if (!Opt.DisableVerify) + addPass(VerifierPass()); +} + +template +Error CodeGenPassBuilder::addCoreISelPasses( + AddMachinePass &addPass) const { + // Enable FastISel with -fast-isel, but allow that to be overridden. + TM.setO0WantsFastISel(Opt.EnableFastISelOption.getValueOr(true)); + + // Determine an instruction selector. + enum class SelectorType { SelectionDAG, FastISel, GlobalISel }; + SelectorType Selector; + + if (Opt.EnableFastISelOption && *Opt.EnableFastISelOption == true) + Selector = SelectorType::FastISel; + else if ((Opt.EnableGlobalISelOption && + *Opt.EnableGlobalISelOption == true) || + (TM.Options.EnableGlobalISel && + (!Opt.EnableGlobalISelOption || + *Opt.EnableGlobalISelOption == false))) + Selector = SelectorType::GlobalISel; + else if (TM.getOptLevel() == CodeGenOpt::None && TM.getO0WantsFastISel()) + Selector = SelectorType::FastISel; + else + Selector = SelectorType::SelectionDAG; + + // Set consistently TM.Options.EnableFastISel and EnableGlobalISel. 
+ if (Selector == SelectorType::FastISel) { + TM.setFastISel(true); + TM.setGlobalISel(false); + } else if (Selector == SelectorType::GlobalISel) { + TM.setFastISel(false); + TM.setGlobalISel(true); + } + + // Add instruction selector passes. + if (Selector == SelectorType::GlobalISel) { + if (auto Err = derived().addIRTranslator(addPass)) + return std::move(Err); + + derived().addPreLegalizeMachineIR(addPass); + + if (auto Err = derived().addLegalizeMachineIR(addPass)) + return std::move(Err); + + // Before running the register bank selector, ask the target if it + // wants to run some passes. + derived().addPreRegBankSelect(addPass); + + if (auto Err = derived().addRegBankSelect(addPass)) + return std::move(Err); + + derived().addPreGlobalInstructionSelect(addPass); + + if (auto Err = derived().addGlobalInstructionSelect(addPass)) + return std::move(Err); + + // Pass to reset the MachineFunction if the ISel failed. + addPass(ResetMachineFunctionPass(reportDiagnosticWhenGlobalISelFallback(), + isGlobalISelAbortEnabled())); + + // Provide a fallback path when we do not want to abort on + // not-yet-supported input. + if (!isGlobalISelAbortEnabled()) + if (auto Err = derived().addInstSelector(addPass)) + return std::move(Err); + + } else if (auto Err = derived().addInstSelector(addPass)) + return std::move(Err); + + // Expand pseudo-instructions emitted by ISel. Don't run the verifier before + // FinalizeISel. + addPass(FinalizeISelPass()); + + // // Print the instruction selected machine code... + // printAndVerify("After Instruction Selection"); + + return Error::success(); +} + +/// Add the complete set of target-independent postISel code generator passes. +/// +/// This can be read as the standard order of major LLVM CodeGen stages. Stages +/// with nontrivial configuration or multiple passes are broken out below in +/// add%Stage routines. +/// +/// Any CodeGenPassBuilder::addXX routine may be overriden by the +/// Target. 
The addPre/Post methods with empty header implementations allow +/// injecting target-specific fixups just before or after major stages. +/// Additionally, targets have the flexibility to change pass order within a +/// stage by overriding default implementation of add%Stage routines below. Each +/// technique has maintainability tradeoffs because alternate pass orders are +/// not well supported. addPre/Post works better if the target pass is easily +/// tied to a common pass. But if it has subtle dependencies on multiple passes, +/// the target should override the stage instead. +template +Error CodeGenPassBuilder::addMachinePasses( + AddMachinePass &addPass) const { + // Add passes that optimize machine instructions in SSA form. + if (getOptLevel() != CodeGenOpt::None) { + derived().addMachineSSAOptimization(addPass); + } else { + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(LocalStackSlotPass()); + } + + if (TM.Options.EnableIPRA) + addPass(RegUsageInfoPropagationPass()); + + // Run pre-ra passes. + derived().addPreRegAlloc(addPass); + + // Run register allocation and passes that are tightly coupled with it, + // including phi elimination and scheduling. + if (*Opt.OptimizeRegAlloc) { + derived().addOptimizedRegAlloc(addPass); + } else { + if (auto Err = derived().addFastRegAlloc(addPass)) + return Err; + } + + // Run post-ra passes. + derived().addPostRegAlloc(addPass); + + // Insert prolog/epilog code. Eliminate abstract frame index references... + if (getOptLevel() != CodeGenOpt::None) { + addPass(PostRAMachineSinkingPass()); + addPass(ShrinkWrapPass()); + } + + addPass(PrologEpilogInserterPass()); + + /// Add passes that optimize machine instructions after register allocation. + if (getOptLevel() != CodeGenOpt::None) + derived().addMachineLateOptimization(addPass); + + // Expand pseudo instructions before second scheduling pass. 
+ addPass(ExpandPostRAPseudosPass()); + + // Run pre-sched2 passes. + derived().addPreSched2(addPass); + + if (Opt.EnableImplicitNullChecks) + addPass(ImplicitNullChecksPass()); + + // Second pass scheduler. + // Let Target optionally insert this pass by itself at some other + // point. + if (getOptLevel() != CodeGenOpt::None && + !TM.targetSchedulesPostRAScheduling()) { + if (Opt.MISchedPostRA) + addPass(PostMachineSchedulerPass()); + else + addPass(PostRASchedulerPass()); + } + + // GC + derived().addGCPasses(addPass); + + // Basic block placement. + if (getOptLevel() != CodeGenOpt::None) + derived().addBlockPlacement(addPass); + + // Insert before XRay Instrumentation. + addPass(FEntryInserterPass()); + + addPass(XRayInstrumentationPass()); + addPass(PatchableFunctionPass()); + + derived().addPreEmitPass(addPass); + + if (TM.Options.EnableIPRA) + // Collect register usage information and produce a register mask of + // clobbered registers, to be used to optimize call sites. + addPass(RegUsageInfoCollectorPass()); + + addPass(FuncletLayoutPass()); + + addPass(StackMapLivenessPass()); + addPass(LiveDebugValuesPass()); + + if (TM.Options.EnableMachineOutliner && getOptLevel() != CodeGenOpt::None && + Opt.EnableMachineOutliner != RunOutliner::NeverOutline) { + bool RunOnAllFunctions = + (Opt.EnableMachineOutliner == RunOutliner::AlwaysOutline); + bool AddOutliner = RunOnAllFunctions || TM.Options.SupportsDefaultOutlining; + if (AddOutliner) + addPass(MachineOutlinerPass(RunOnAllFunctions)); + } + + // Add passes that directly emit MI after all other MI passes. + derived().addPreEmitPass2(addPass); + + return Error::success(); +} + +/// Add passes that optimize machine instructions in SSA form. +template +void CodeGenPassBuilder::addMachineSSAOptimization( + AddMachinePass &addPass) const { + // Pre-ra tail duplication. + addPass(EarlyTailDuplicatePass()); + + // Optimize PHIs before DCE: removing dead PHI cycles may make more + // instructions dead. 
+ addPass(OptimizePHIsPass()); + + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(StackColoringPass()); + + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(LocalStackSlotPass()); + + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + addPass(DeadMachineInstructionElimPass()); + + // Allow targets to insert passes that improve instruction level parallelism, + // like if-conversion. Such passes will typically need dominator trees and + // loop info, just like LICM and CSE below. + derived().addILPOpts(addPass); + + addPass(EarlyMachineLICMPass()); + addPass(MachineCSEPass()); + + addPass(MachineSinkingPass()); + + addPass(PeepholeOptimizerPass()); + // Clean-up the dead code that may have been generated by peephole + // rewriting. + addPass(DeadMachineInstructionElimPass()); +} + +//===---------------------------------------------------------------------===// +/// Register Allocation Pass Configuration +//===---------------------------------------------------------------------===// + +/// Instantiate the default register allocator pass for this target for either +/// the optimized or unoptimized allocation path. This will be added to the pass +/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc +/// in the optimized case. +/// +/// A target that uses the standard regalloc pass order for fast or optimized +/// allocation may still override this for per-target regalloc +/// selection. But -regalloc=... always takes precedence. 
+template +void CodeGenPassBuilder::addTargetRegisterAllocator( + AddMachinePass &addPass, bool Optimized) const { + if (Optimized) + addPass(RAGreedyPass()); + else + addPass(RAFastPass()); +} + +/// Find and instantiate the register allocation pass requested by this target +/// at the current optimization level. Different register allocators are +/// defined as separate passes because they may require different analysis. +template +void CodeGenPassBuilder::addRegAllocPass(AddMachinePass &addPass, + bool Optimized) const { + if (Opt.RegAlloc == RegAllocType::Default) + // With no -regalloc= override, ask the target for a regalloc pass. + derived().addTargetRegisterAllocator(addPass, Optimized); + else if (Opt.RegAlloc == RegAllocType::Basic) + addPass(RABasicPass()); + else if (Opt.RegAlloc == RegAllocType::Fast) + addPass(RAFastPass()); + else if (Opt.RegAlloc == RegAllocType::Greedy) + addPass(RAGreedyPass()); + else if (Opt.RegAlloc == RegAllocType::PBQP) + addPass(RAPBQPPass()); + else + llvm_unreachable("unknonwn register allocator type"); +} + +template +Error CodeGenPassBuilder::addRegAssignmentFast( + AddMachinePass &addPass) const { + if (Opt.RegAlloc != RegAllocType::Default && + Opt.RegAlloc != RegAllocType::Fast) + return make_error( + "Must use fast (default) register allocator for unoptimized regalloc.", + inconvertibleErrorCode()); + + addRegAllocPass(addPass, false); + return Error::success(); +} + +template +Error CodeGenPassBuilder::addRegAssignmentOptimized( + AddMachinePass &addPass) const { + // Add the selected register allocation pass. + addRegAllocPass(addPass, true); + + // Allow targets to change the register assignments before rewriting. + derived().addPreRewrite(addPass); + + // Finally rewrite virtual registers. + addPass(VirtRegRewriterPass()); + // Perform stack slot coloring and post-ra machine LICM. + // + // FIXME: Re-enable coloring with register when it's capable of adding + // kill markers. 
+ addPass(StackSlotColoringPass()); + + return Error::success(); +} + +/// Add the minimum set of target-independent passes that are required for +/// register allocation. No coalescing or scheduling. +template +Error CodeGenPassBuilder::addFastRegAlloc( + AddMachinePass &addPass) const { + addPass(PHIEliminationPass()); + addPass(TwoAddressInstructionPass()); + return derived().addRegAssignmentFast(addPass); +} + +/// Add standard target-independent passes that are tightly coupled with +/// optimized register allocation, including coalescing, machine instruction +/// scheduling, and register allocation itself. +template +void CodeGenPassBuilder::addOptimizedRegAlloc( + AddMachinePass &addPass) const { + addPass(DetectDeadLanesPass()); + + addPass(ProcessImplicitDefsPass()); + + // Edge splitting is smarter with machine loop info. + addPass(PHIEliminationPass()); + + // Eventually, we want to run LiveIntervals before PHI elimination. + if (Opt.EarlyLiveIntervals) + addPass(LiveIntervalsPass()); + + addPass(TwoAddressInstructionPass()); + addPass(RegisterCoalescerPass()); + + // The machine scheduler may accidentally create disconnected components + // when moving subregister definitions around, avoid this by splitting them to + // separate vregs before. Splitting can also improve reg. allocation quality. + addPass(RenameIndependentSubregsPass()); + + // PreRA instruction scheduling. + addPass(MachineSchedulerPass()); + + if (derived().addRegAssignmentOptimized(addPass)) { + // Allow targets to expand pseudo instructions depending on the choice of + // registers before MachineCopyPropagation. + derived().addPostRewrite(addPass); + + // Copy propagate to forward register uses and try to eliminate COPYs that + // were not coalesced. + addPass(MachineCopyPropagationPass()); + + // Run post-ra machine LICM to hoist reloads / remats. + // + // FIXME: can this move into MachineLateOptimization? 
+ addPass(MachineLICMPass()); + } +} + +//===---------------------------------------------------------------------===// +/// Post RegAlloc Pass Configuration +//===---------------------------------------------------------------------===// + +/// Add passes that optimize machine instructions after register allocation. +template +void CodeGenPassBuilder::addMachineLateOptimization( + AddMachinePass &addPass) const { + // Branch folding must be run after regalloc and prolog/epilog insertion. + addPass(BranchFolderPass()); + + // Tail duplication. + // Note that duplicating tail just increases code size and degrades + // performance for targets that require Structured Control Flow. + // In addition it can also make CFG irreducible. Thus we disable it. + if (!TM.requiresStructuredCFG()) + addPass(TailDuplicatePass()); + + // Copy propagation. + addPass(MachineCopyPropagationPass()); +} + +/// Add standard basic block placement passes. +template +void CodeGenPassBuilder::addBlockPlacement( + AddMachinePass &addPass) const { + addPass(MachineBlockPlacementPass()); + // Run a separate pass to collect block placement statistics. 
+ if (Opt.EnableBlockPlacementStats) + addPass(MachineBlockPlacementStatsPass()); +} + +} // namespace llvm + +#endif // LLVM_CODEGEN_CODEGENPASSBUILDER_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/CommandFlags.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/CommandFlags.h index 1b77556dcbb1..e6c64cd4dd8e 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/CommandFlags.h @@ -14,6 +14,7 @@ #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCTargetOptionsCommandFlags.h" @@ -74,6 +75,8 @@ bool getDontPlaceZerosInBSS(); bool getEnableGuaranteedTailCallOpt(); +bool getEnableAIXExtendedAltivecABI(); + bool getDisableTailCalls(); bool getStackSymbolOrdering(); @@ -94,8 +97,16 @@ Optional getExplicitDataSections(); bool getFunctionSections(); Optional getExplicitFunctionSections(); +bool getIgnoreXCOFFVisibility(); + +bool getXCOFFTracebackTable(); + std::string getBBSections(); +std::string getStackProtectorGuard(); +unsigned getStackProtectorGuardOffset(); +std::string getStackProtectorGuardReg(); + unsigned getTLSSize(); bool getEmulatedTLS(); @@ -114,8 +125,14 @@ bool getEnableAddrsig(); bool getEmitCallSiteInfo(); +bool getEnableMachineFunctionSplitter(); + bool getEnableDebugEntryValues(); +bool getPseudoProbeForProfiling(); + +bool getValueTrackingVariableLocations(); + bool getForceDwarfFrameSection(); bool getXRayOmitFunctionIndex(); @@ -128,9 +145,16 @@ struct RegisterCodeGenFlags { llvm::BasicBlockSection getBBSectionsMode(llvm::TargetOptions &Options); -// Common utility function tightly tied to the options listed here. Initializes -// a TargetOptions object with CodeGen flags and returns it. 
-TargetOptions InitTargetOptionsFromCodeGenFlags(); +llvm::StackProtectorGuards +getStackProtectorGuardMode(llvm::TargetOptions &Options); + +/// Common utility function tightly tied to the options listed here. Initializes +/// a TargetOptions object with CodeGen flags and returns it. +/// \p TheTriple is used to determine the default value for options if +/// options are not explicitly specified. If those triple dependant options +/// value do not have effect for your component, a default Triple() could be +/// passed in. +TargetOptions InitTargetOptionsFromCodeGenFlags(const llvm::Triple &TheTriple); std::string getCPUStr(); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/DIE.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/DIE.h index c7baaf6aef3d..3efef6ec0acd 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/DIE.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/DIE.h @@ -247,6 +247,7 @@ public: unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const; void print(raw_ostream &O) const; + uint64_t getIndex() const { return Index; } }; //===--------------------------------------------------------------------===// @@ -382,12 +383,12 @@ private: static_assert(std::is_standard_layout::value || std::is_pointer::value, "Expected standard layout or pointer"); - new (reinterpret_cast(Val.buffer)) T(V); + new (reinterpret_cast(&Val)) T(V); } - template T *get() { return reinterpret_cast(Val.buffer); } + template T *get() { return reinterpret_cast(&Val); } template const T *get() const { - return reinterpret_cast(Val.buffer); + return reinterpret_cast(&Val); } template void destruct() { get()->~T(); } @@ -589,7 +590,6 @@ public: T &operator*() const { return *static_cast(N); } bool operator==(const iterator &X) const { return N == X.N; } - bool operator!=(const iterator &X) const { return N != X.N; } }; class const_iterator @@ -612,7 +612,6 @@ public: const T &operator*() const { return *static_cast(N); } bool operator==(const 
const_iterator &X) const { return N == X.N; } - bool operator!=(const const_iterator &X) const { return N != X.N; } }; iterator begin() { @@ -788,7 +787,7 @@ public: /// Get the absolute offset within the .debug_info or .debug_types section /// for this DIE. - unsigned getDebugSectionOffset() const; + uint64_t getDebugSectionOffset() const; /// Compute the offset of this DIE and all its children. /// @@ -864,14 +863,11 @@ class DIEUnit { /// a valid section depending on the client that is emitting DWARF. MCSection *Section; uint64_t Offset; /// .debug_info or .debug_types absolute section offset. - uint32_t Length; /// The length in bytes of all of the DIEs in this unit. - const uint16_t Version; /// The Dwarf version number for this unit. - const uint8_t AddrSize; /// The size in bytes of an address for this unit. protected: virtual ~DIEUnit() = default; public: - DIEUnit(uint16_t Version, uint8_t AddrSize, dwarf::Tag UnitTag); + explicit DIEUnit(dwarf::Tag UnitTag); DIEUnit(const DIEUnit &RHS) = delete; DIEUnit(DIEUnit &&RHS) = delete; void operator=(const DIEUnit &RHS) = delete; @@ -893,19 +889,14 @@ public: /// /// \returns Section pointer which can be NULL. 
MCSection *getSection() const { return Section; } - void setDebugSectionOffset(unsigned O) { Offset = O; } - unsigned getDebugSectionOffset() const { return Offset; } - void setLength(uint64_t L) { Length = L; } - uint64_t getLength() const { return Length; } - uint16_t getDwarfVersion() const { return Version; } - uint16_t getAddressSize() const { return AddrSize; } + void setDebugSectionOffset(uint64_t O) { Offset = O; } + uint64_t getDebugSectionOffset() const { return Offset; } DIE &getUnitDie() { return Die; } const DIE &getUnitDie() const { return Die; } }; struct BasicDIEUnit final : DIEUnit { - BasicDIEUnit(uint16_t Version, uint8_t AddrSize, dwarf::Tag UnitTag) - : DIEUnit(Version, AddrSize, UnitTag) {} + explicit BasicDIEUnit(dwarf::Tag UnitTag) : DIEUnit(UnitTag) {} }; //===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h index f7fc74a27fca..bca6065b1643 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/DbgEntityHistoryCalculator.h @@ -12,6 +12,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LexicalScopes.h" #include namespace llvm { @@ -23,6 +24,24 @@ class MachineFunction; class MachineInstr; class TargetRegisterInfo; +/// Record instruction ordering so we can query their relative positions within +/// a function. Meta instructions are given the same ordinal as the preceding +/// non-meta instruction. Class state is invalid if MF is modified after +/// calling initialize. 
+class InstructionOrdering { +public: + void initialize(const MachineFunction &MF); + void clear() { InstNumberMap.clear(); } + + /// Check if instruction \p A comes before \p B, where \p A and \p B both + /// belong to the MachineFunction passed to initialize(). + bool isBefore(const MachineInstr *A, const MachineInstr *B) const; + +private: + /// Each instruction is assigned an order number. + DenseMap InstNumberMap; +}; + /// For each user variable, keep a list of instruction ranges where this /// variable is accessible. The variables are listed in order of appearance. class DbgValueHistoryMap { @@ -52,6 +71,8 @@ public: /// register-described debug values that have their end index /// set to this entry's position in the entry vector. class Entry { + friend DbgValueHistoryMap; + public: enum EntryKind { DbgValue, Clobber }; @@ -89,6 +110,9 @@ public: return Entries[Index]; } + /// Drop location ranges which exist entirely outside each variable's scope. + void trimLocationRanges(const MachineFunction &MF, LexicalScopes &LScopes, + const InstructionOrdering &Ordering); bool empty() const { return VarEntries.empty(); } void clear() { VarEntries.clear(); } EntriesMap::const_iterator begin() const { return VarEntries.begin(); } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 4ff0fdea36ae..45823b2ba349 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -110,8 +110,13 @@ protected: virtual void endFunctionImpl(const MachineFunction *MF) = 0; virtual void skippedNonDebugFunction() {} +private: + InstructionOrdering InstOrdering; + // AsmPrinterHandler overrides. 
public: + void beginModule(Module *M) override; + void beginInstruction(const MachineInstr *MI) override; void endInstruction() override; @@ -129,8 +134,13 @@ public: /// If this type is derived from a base type then return base type size. static uint64_t getBaseTypeSize(const DIType *Ty); + + /// Return true if type encoding is unsigned. + static bool isUnsignedDIType(const DIType *Ty); + + const InstructionOrdering &getInstOrdering() const { return InstOrdering; } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h index e189352a7b2d..abeba62707c1 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/DwarfStringPoolEntry.h @@ -21,7 +21,7 @@ struct DwarfStringPoolEntry { static constexpr unsigned NotIndexed = -1; MCSymbol *Symbol; - unsigned Offset; + uint64_t Offset; unsigned Index; bool isIndexed() const { return Index != NotIndexed; } @@ -47,7 +47,7 @@ public: assert(getMapEntry()->second.Symbol && "No symbol available!"); return getMapEntry()->second.Symbol; } - unsigned getOffset() const { return getMapEntry()->second.Offset; } + uint64_t getOffset() const { return getMapEntry()->second.Offset; } bool isIndexed() const { return MapEntryAndIndexed.getInt(); } unsigned getIndex() const { assert(isIndexed()); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/FastISel.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/FastISel.h index 7662179db44d..26bf4ab2618c 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/FastISel.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/FastISel.h @@ -224,10 +224,6 @@ protected: /// makes sense (for example, on function calls) MachineInstr *EmitStartPt; - /// Last local value flush point. On a subsequent flush, no local value will - /// sink past this point. 
- MachineBasicBlock::iterator LastFlushPoint; - public: virtual ~FastISel(); @@ -246,7 +242,7 @@ public: /// be appended. void startNewBlock(); - /// Flush the local value map and sink local values if possible. + /// Flush the local value map. void finishBasicBlock(); /// Return current debug location information. @@ -313,10 +309,7 @@ public: void removeDeadCode(MachineBasicBlock::iterator I, MachineBasicBlock::iterator E); - struct SavePoint { - MachineBasicBlock::iterator InsertPt; - DebugLoc DL; - }; + using SavePoint = MachineBasicBlock::iterator; /// Prepare InsertPt to begin inserting instructions into the local /// value area and return the old insert position. @@ -497,7 +490,10 @@ protected: /// - \c Add has a constant operand. bool canFoldAddIntoGEP(const User *GEP, const Value *Add); - /// Test whether the given value has exactly one use. + /// Test whether the register associated with this value has exactly one use, + /// in which case that single use is killing. Note that multiple IR values + /// may map onto the same register, in which case this is not the same as + /// checking that an IR value has one use. bool hasTrivialKill(const Value *V); /// Create a machine mem operand from the given instruction. @@ -510,18 +506,6 @@ protected: unsigned NumArgs); bool lowerCallTo(CallLoweringInfo &CLI); - bool isCommutativeIntrinsic(IntrinsicInst const *II) { - switch (II->getIntrinsicID()) { - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - return true; - default: - return false; - } - } - bool lowerCall(const CallInst *I); /// Select and emit code for a binary operator instruction, which has /// an opcode which directly corresponds to the given ISD opcode. 
@@ -536,7 +520,6 @@ protected: bool selectFreeze(const User *I); bool selectCast(const User *I, unsigned Opcode); bool selectExtractValue(const User *U); - bool selectInsertValue(const User *I); bool selectXRayCustomEvent(const CallInst *II); bool selectXRayTypedEvent(const CallInst *II); @@ -572,20 +555,6 @@ private: /// Removes dead local value instructions after SavedLastLocalvalue. void removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue); - struct InstOrderMap { - DenseMap Orders; - MachineInstr *FirstTerminator = nullptr; - unsigned FirstTerminatorOrder = std::numeric_limits::max(); - - void initialize(MachineBasicBlock *MBB, - MachineBasicBlock::iterator LastFlushPoint); - }; - - /// Sinks the local value materialization instruction LocalMI to its first use - /// in the basic block, or deletes it if it is not used. - void sinkLocalValueMaterialization(MachineInstr &LocalMI, Register DefReg, - InstOrderMap &OrderMap); - /// Insertion point before trying to select the current instruction. MachineBasicBlock::iterator SavedInsertPt; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index c99ca00eac29..b6bde0249f88 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -91,13 +91,33 @@ public: /// Track virtual registers created for exception pointers. DenseMap CatchPadExceptionPointers; - /// Keep track of frame indices allocated for statepoints as they could be - /// used across basic block boundaries (e.g. for an invoke). For each - /// gc.statepoint instruction, maps uniqued llvm IR values to the slots they - /// were spilled in. If a value is mapped to None it means we visited the - /// value but didn't spill it (because it was a constant, for instance). 
- using StatepointSpillMapTy = DenseMap>; - DenseMap StatepointSpillMaps; + /// Helper object to track which of three possible relocation mechanisms are + /// used for a particular value being relocated over a statepoint. + struct StatepointRelocationRecord { + enum RelocType { + // Value did not need to be relocated and can be used directly. + NoRelocate, + // Value was spilled to stack and needs filled at the gc.relocate. + Spill, + // Value was lowered to tied def and gc.relocate should be replaced with + // copy from vreg. + VReg, + } type = NoRelocate; + // Payload contains either frame index of the stack slot in which the value + // was spilled, or virtual register which contains the re-definition. + union payload_t { + payload_t() : FI(-1) {} + int FI; + Register Reg; + } payload; + }; + + /// Keep track of each value which was relocated and the strategy used to + /// relocate that value. This information is required when visiting + /// gc.relocates which may appear in following blocks. + using StatepointSpillMapTy = + DenseMap; + DenseMap StatepointRelocationMaps; /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in /// the entry block. 
This allows the allocas to be efficiently referenced diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h index 8bd9e9443552..f76dec57c840 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h @@ -16,14 +16,12 @@ #include "llvm/CodeGen/CSEConfigBase.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelWorkList.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/CodeGen.h" namespace llvm { +class MachineBasicBlock; /// A class that wraps MachineInstrs and derives from FoldingSetNode in order to /// be uniqued in a CSEMap. The tradeoff here is extra memory allocations for @@ -184,6 +182,8 @@ public: const GISelInstProfileBuilder &addNodeIDRegNum(Register Reg) const; + const GISelInstProfileBuilder &addNodeIDReg(Register Reg) const; + const GISelInstProfileBuilder &addNodeIDImmediate(int64_t Imm) const; const GISelInstProfileBuilder & addNodeIDMBB(const MachineBasicBlock *MBB) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index 4d60dffb91db..26ae7129f04a 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -17,8 +17,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetCallingConv.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" +#include 
"llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include @@ -26,16 +29,14 @@ namespace llvm { -class CCState; class CallBase; class DataLayout; class Function; +class FunctionLoweringInfo; class MachineIRBuilder; -class MachineOperand; struct MachinePointerInfo; class MachineRegisterInfo; class TargetLowering; -class Type; class Value; class CallLowering { @@ -43,21 +44,30 @@ class CallLowering { virtual void anchor(); public: - struct ArgInfo { + struct BaseArgInfo { + Type *Ty; + SmallVector Flags; + bool IsFixed; + + BaseArgInfo(Type *Ty, + ArrayRef Flags = ArrayRef(), + bool IsFixed = true) + : Ty(Ty), Flags(Flags.begin(), Flags.end()), IsFixed(IsFixed) {} + + BaseArgInfo() : Ty(nullptr), IsFixed(false) {} + }; + + struct ArgInfo : public BaseArgInfo { SmallVector Regs; // If the argument had to be split into multiple parts according to the // target calling convention, then this contains the original vregs // if the argument was an incoming arg. SmallVector OrigRegs; - Type *Ty; - SmallVector Flags; - bool IsFixed; ArgInfo(ArrayRef Regs, Type *Ty, ArrayRef Flags = ArrayRef(), bool IsFixed = true) - : Regs(Regs.begin(), Regs.end()), Ty(Ty), - Flags(Flags.begin(), Flags.end()), IsFixed(IsFixed) { + : BaseArgInfo(Ty, Flags, IsFixed), Regs(Regs.begin(), Regs.end()) { if (!Regs.empty() && Flags.empty()) this->Flags.push_back(ISD::ArgFlagsTy()); // FIXME: We should have just one way of saying "no register". @@ -66,7 +76,7 @@ public: "only void types should have no register"); } - ArgInfo() : Ty(nullptr), IsFixed(false) {} + ArgInfo() : BaseArgInfo() {} }; struct CallLoweringInfo { @@ -102,6 +112,15 @@ public: /// True if the call is to a vararg function. bool IsVarArg = false; + + /// True if the function's return value can be lowered to registers. + bool CanLowerReturn = true; + + /// VReg to hold the hidden sret parameter. + Register DemoteRegister; + + /// The stack index for sret demotion. 
+ int DemoteStackIndex; }; /// Argument handling is mostly uniform between the four places that @@ -111,15 +130,18 @@ public: /// argument should go, exactly what happens can vary slightly. This /// class abstracts the differences. struct ValueHandler { - ValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn) - : MIRBuilder(MIRBuilder), MRI(MRI), AssignFn(AssignFn) {} + ValueHandler(bool IsIncoming, MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, CCAssignFn *AssignFn) + : MIRBuilder(MIRBuilder), MRI(MRI), AssignFn(AssignFn), + IsIncomingArgumentHandler(IsIncoming) {} virtual ~ValueHandler() = default; /// Returns true if the handler is dealing with incoming arguments, /// i.e. those that move values from some physical location to vregs. - virtual bool isIncomingArgumentHandler() const = 0; + bool isIncomingArgumentHandler() const { + return IsIncomingArgumentHandler; + } /// Materialize a VReg containing the address of the specified /// stack-based object. This is either based on a FrameIndex or @@ -147,6 +169,7 @@ public: virtual void assignValueToAddress(const ArgInfo &Arg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) { + assert(Arg.Regs.size() == 1); assignValueToAddress(Arg.Regs[0], Addr, Size, MPO, VA); } @@ -177,9 +200,22 @@ public: CCAssignFn *AssignFn; private: + bool IsIncomingArgumentHandler; virtual void anchor(); }; + struct IncomingValueHandler : public ValueHandler { + IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(true, MIRBuilder, MRI, AssignFn) {} + }; + + struct OutgoingValueHandler : public ValueHandler { + OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(false, MIRBuilder, MRI, AssignFn) {} + }; + protected: /// Getter for generic TargetLowering class. 
const TargetLowering *getTLI() const { @@ -192,6 +228,17 @@ protected: return static_cast(TLI); } + /// \returns Flags corresponding to the attributes on the \p ArgIdx-th + /// parameter of \p Call. + ISD::ArgFlagsTy getAttributesForArgIdx(const CallBase &Call, + unsigned ArgIdx) const; + + /// Adds flags to \p Flags based off of the attributes in \p Attrs. + /// \p OpIdx is the index in \p Attrs to add flags from. + void addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags, + const AttributeList &Attrs, + unsigned OpIdx) const; + template void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const; @@ -215,7 +262,7 @@ protected: MachineIRBuilder &MIRBuilder) const; /// Invoke Handler::assignArg on each of the given \p Args and then use - /// \p Callback to move them to the assigned locations. + /// \p Handler to move them to the assigned locations. /// /// \return True if everything has succeeded, false otherwise. bool handleAssignments(MachineIRBuilder &MIRBuilder, @@ -235,6 +282,14 @@ protected: CCAssignFn &AssignFnFixed, CCAssignFn &AssignFnVarArg) const; + /// Check whether parameters to a call that are passed in callee saved + /// registers are the same as from the calling function. This needs to be + /// checked for tail call eligibility. + bool parametersInCSRMatch(const MachineRegisterInfo &MRI, + const uint32_t *CallerPreservedMask, + const SmallVectorImpl &ArgLocs, + const SmallVectorImpl &OutVals) const; + /// \returns True if the calling convention for a callee and its caller pass /// results in the same way. Typically used for tail call eligibility checks. /// @@ -265,20 +320,73 @@ public: return false; } + /// Load the returned value from the stack into virtual registers in \p VRegs. + /// It uses the frame index \p FI and the start offset from \p DemoteReg. + /// The loaded data size will be determined from \p RetTy. 
+ void insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, + ArrayRef VRegs, Register DemoteReg, + int FI) const; + + /// Store the return value given by \p VRegs into stack starting at the offset + /// specified in \p DemoteReg. + void insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, + ArrayRef VRegs, Register DemoteReg) const; + + /// Insert the hidden sret ArgInfo to the beginning of \p SplitArgs. + /// This function should be called from the target specific + /// lowerFormalArguments when \p F requires the sret demotion. + void insertSRetIncomingArgument(const Function &F, + SmallVectorImpl &SplitArgs, + Register &DemoteReg, MachineRegisterInfo &MRI, + const DataLayout &DL) const; + + /// For the call-base described by \p CB, insert the hidden sret ArgInfo to + /// the OrigArgs field of \p Info. + void insertSRetOutgoingArgument(MachineIRBuilder &MIRBuilder, + const CallBase &CB, + CallLoweringInfo &Info) const; + + /// \return True if the return type described by \p Outs can be returned + /// without performing sret demotion. + bool checkReturn(CCState &CCInfo, SmallVectorImpl &Outs, + CCAssignFn *Fn) const; + + /// Get the type and the ArgFlags for the split components of \p RetTy as + /// returned by \c ComputeValueVTs. + void getReturnInfo(CallingConv::ID CallConv, Type *RetTy, AttributeList Attrs, + SmallVectorImpl &Outs, + const DataLayout &DL) const; + + /// Toplevel function to check the return type based on the target calling + /// convention. \return True if the return value of \p MF can be returned + /// without performing sret demotion. + bool checkReturnTypeForCallConv(MachineFunction &MF) const; + + /// This hook must be implemented to check whether the return values + /// described by \p Outs can fit into the return registers. If false + /// is returned, an sret-demotion is performed. 
+ virtual bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, + SmallVectorImpl &Outs, + bool IsVarArg) const { + return true; + } + /// This hook must be implemented to lower outgoing return values, described /// by \p Val, into the specified virtual registers \p VRegs. /// This hook is used by GlobalISel. /// + /// \p FLI is required for sret demotion. + /// /// \p SwiftErrorVReg is non-zero if the function has a swifterror parameter /// that needs to be implicitly returned. /// /// \return True if the lowering succeeds, false otherwise. virtual bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs, + ArrayRef VRegs, FunctionLoweringInfo &FLI, Register SwiftErrorVReg) const { if (!supportSwiftError()) { assert(SwiftErrorVReg == 0 && "attempt to use unsupported swifterror"); - return lowerReturn(MIRBuilder, Val, VRegs); + return lowerReturn(MIRBuilder, Val, VRegs, FLI); } return false; } @@ -286,7 +394,8 @@ public: /// This hook behaves as the extended lowerReturn function, but for targets /// that do not support swifterror value promotion. virtual bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef VRegs) const { + ArrayRef VRegs, + FunctionLoweringInfo &FLI) const { return false; } @@ -299,12 +408,13 @@ public: /// the second in \c VRegs[1], and so on. For each argument, there will be one /// register for each non-aggregate type, as returned by \c computeValueLLTs. /// \p MIRBuilder is set to the proper insertion for the argument - /// lowering. + /// lowering. \p FLI is required for sret demotion. /// /// \return True if the lowering succeeded, false otherwise. 
virtual bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef> VRegs) const { + ArrayRef> VRegs, + FunctionLoweringInfo &FLI) const { return false; } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index c317b7ed4c54..8570f5ca5dd5 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -17,6 +17,8 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H #define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/Register.h" #include "llvm/Support/Alignment.h" @@ -25,12 +27,15 @@ namespace llvm { class GISelChangeObserver; class MachineIRBuilder; +class MachineInstrBuilder; class MachineRegisterInfo; class MachineInstr; class MachineOperand; class GISelKnownBits; class MachineDominatorTree; class LegalizerInfo; +struct LegalityQuery; +class TargetLowering; struct PreferredTuple { LLT Ty; // The result type of the extend. @@ -50,6 +55,37 @@ struct PtrAddChain { Register Base; }; +struct RegisterImmPair { + Register Reg; + int64_t Imm; +}; + +struct ShiftOfShiftedLogic { + MachineInstr *Logic; + MachineInstr *Shift2; + Register LogicNonShiftReg; + uint64_t ValSum; +}; + +using OperandBuildSteps = + SmallVector, 4>; +struct InstructionBuildSteps { + unsigned Opcode = 0; /// The opcode for the produced instruction. + OperandBuildSteps OperandFns; /// Operands to be added to the instruction. + InstructionBuildSteps() = default; + InstructionBuildSteps(unsigned Opcode, const OperandBuildSteps &OperandFns) + : Opcode(Opcode), OperandFns(OperandFns) {} +}; + +struct InstructionStepsMatchInfo { + /// Describes instructions to be built during a combine. 
+ SmallVector InstrsToBuild; + InstructionStepsMatchInfo() = default; + InstructionStepsMatchInfo( + std::initializer_list InstrsToBuild) + : InstrsToBuild(InstrsToBuild) {} +}; + class CombinerHelper { protected: MachineIRBuilder &Builder; @@ -69,6 +105,12 @@ public: return KB; } + const TargetLowering &getTargetLowering() const; + + /// \return true if the combine is running prior to legalization, or if \p + /// Query is legal on the target. + bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const; + /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const; @@ -107,12 +149,17 @@ public: bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); - bool matchSextAlreadyExtended(MachineInstr &MI); - bool applySextAlreadyExtended(MachineInstr &MI); + bool matchSextTruncSextLoad(MachineInstr &MI); + bool applySextTruncSextLoad(MachineInstr &MI); - bool matchElideBrByInvertingCond(MachineInstr &MI); - void applyElideBrByInvertingCond(MachineInstr &MI); - bool tryElideBrByInvertingCond(MachineInstr &MI); + /// Match sext_inreg(load p), imm -> sextload p + bool matchSextInRegOfLoad(MachineInstr &MI, std::tuple &MatchInfo); + bool applySextInRegOfLoad(MachineInstr &MI, std::tuple &MatchInfo); + + /// If a brcond's true block is not the fallthrough, make it so by inverting + /// the condition and swapping operands. + bool matchOptBrCondByInvertingCond(MachineInstr &MI); + void applyOptBrCondByInvertingCond(MachineInstr &MI); /// If \p MI is G_CONCAT_VECTORS, try to combine it. /// Returns true if MI changed. 
@@ -189,10 +236,28 @@ public: bool matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); bool applyPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); + /// Fold (shift (shift base, x), y) -> (shift base (x+y)) + bool matchShiftImmedChain(MachineInstr &MI, RegisterImmPair &MatchInfo); + bool applyShiftImmedChain(MachineInstr &MI, RegisterImmPair &MatchInfo); + + /// If we have a shift-by-constant of a bitwise logic op that itself has a + /// shift-by-constant operand with identical opcode, we may be able to convert + /// that into 2 independent shifts followed by the logic op. + bool matchShiftOfShiftedLogic(MachineInstr &MI, + ShiftOfShiftedLogic &MatchInfo); + bool applyShiftOfShiftedLogic(MachineInstr &MI, + ShiftOfShiftedLogic &MatchInfo); + /// Transform a multiply by a power-of-2 value to a left shift. bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + // Transform a G_SHL with an extended source into a narrower shift if + // possible. + bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData); + bool applyCombineShlOfExtend(MachineInstr &MI, + const RegisterImmPair &MatchData); + /// Reduce a shift by a constant to an unmerge and a shift on a half sized /// type. This will not produce a shift smaller than \p TargetShiftSize. bool matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize, @@ -200,6 +265,86 @@ public: bool applyCombineShiftToUnmerge(MachineInstr &MI, const unsigned &ShiftVal); bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount); + /// Transform G_UNMERGE(G_MERGE ty X, Y, Z) -> ty X, Y, Z. + bool + matchCombineUnmergeMergeToPlainValues(MachineInstr &MI, + SmallVectorImpl &Operands); + bool + applyCombineUnmergeMergeToPlainValues(MachineInstr &MI, + SmallVectorImpl &Operands); + + /// Transform G_UNMERGE Constant -> Constant1, Constant2, ... 
+ bool matchCombineUnmergeConstant(MachineInstr &MI, + SmallVectorImpl &Csts); + bool applyCombineUnmergeConstant(MachineInstr &MI, + SmallVectorImpl &Csts); + + /// Transform X, Y = G_UNMERGE Z -> X = G_TRUNC Z. + bool matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI); + bool applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI); + + /// Transform X, Y = G_UNMERGE(G_ZEXT(Z)) -> X = G_ZEXT(Z); Y = G_CONSTANT 0 + bool matchCombineUnmergeZExtToZExt(MachineInstr &MI); + bool applyCombineUnmergeZExtToZExt(MachineInstr &MI); + + /// Transform fp_instr(cst) to constant result of the fp operation. + bool matchCombineConstantFoldFpUnary(MachineInstr &MI, + Optional &Cst); + bool applyCombineConstantFoldFpUnary(MachineInstr &MI, + Optional &Cst); + + /// Transform IntToPtr(PtrToInt(x)) to x if cast is in the same address space. + bool matchCombineI2PToP2I(MachineInstr &MI, Register &Reg); + bool applyCombineI2PToP2I(MachineInstr &MI, Register &Reg); + + /// Transform PtrToInt(IntToPtr(x)) to x. + bool matchCombineP2IToI2P(MachineInstr &MI, Register &Reg); + bool applyCombineP2IToI2P(MachineInstr &MI, Register &Reg); + + /// Transform G_ADD (G_PTRTOINT x), y -> G_PTRTOINT (G_PTR_ADD x, y) + /// Transform G_ADD y, (G_PTRTOINT x) -> G_PTRTOINT (G_PTR_ADD x, y) + bool matchCombineAddP2IToPtrAdd(MachineInstr &MI, + std::pair &PtrRegAndCommute); + bool applyCombineAddP2IToPtrAdd(MachineInstr &MI, + std::pair &PtrRegAndCommute); + + // Transform G_PTR_ADD (G_PTRTOINT C1), C2 -> C1 + C2 + bool matchCombineConstPtrAddToI2P(MachineInstr &MI, int64_t &NewCst); + bool applyCombineConstPtrAddToI2P(MachineInstr &MI, int64_t &NewCst); + + /// Transform anyext(trunc(x)) to x. + bool matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg); + bool applyCombineAnyExtTrunc(MachineInstr &MI, Register &Reg); + + /// Transform [asz]ext([asz]ext(x)) to [asz]ext x. 
+ bool matchCombineExtOfExt(MachineInstr &MI, + std::tuple &MatchInfo); + bool applyCombineExtOfExt(MachineInstr &MI, + std::tuple &MatchInfo); + + /// Transform fneg(fneg(x)) to x. + bool matchCombineFNegOfFNeg(MachineInstr &MI, Register &Reg); + + /// Match fabs(fabs(x)) to fabs(x). + bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); + bool applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); + + /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x). + bool matchCombineTruncOfExt(MachineInstr &MI, + std::pair &MatchInfo); + bool applyCombineTruncOfExt(MachineInstr &MI, + std::pair &MatchInfo); + + /// Transform trunc (shl x, K) to shl (trunc x), + /// K => K < VT.getScalarSizeInBits(). + bool matchCombineTruncOfShl(MachineInstr &MI, + std::pair &MatchInfo); + bool applyCombineTruncOfShl(MachineInstr &MI, + std::pair &MatchInfo); + + /// Transform G_MUL(x, -1) to G_SUB(0, x) + bool applyCombineMulByNegativeOne(MachineInstr &MI); + /// Return true if any explicit use operand on \p MI is defined by a /// G_IMPLICIT_DEF. bool matchAnyExplicitUseIsUndef(MachineInstr &MI); @@ -214,6 +359,13 @@ public: /// Return true if a G_STORE instruction \p MI is storing an undef value. bool matchUndefStore(MachineInstr &MI); + /// Return true if a G_SELECT instruction \p MI has an undef comparison. + bool matchUndefSelectCmp(MachineInstr &MI); + + /// Return true if a G_SELECT instruction \p MI has a constant comparison. If + /// true, \p OpIdx will store the operand index of the known selected value. + bool matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx); + /// Replace an instruction with a G_FCONSTANT with value \p C. bool replaceInstWithFConstant(MachineInstr &MI, double C); @@ -226,6 +378,9 @@ public: /// Delete \p MI and replace all of its uses with its \p OpIdx-th operand. bool replaceSingleDefInstWithOperand(MachineInstr &MI, unsigned OpIdx); + /// Delete \p MI and replace all of its uses with \p Replacement. 
+ bool replaceSingleDefInstWithReg(MachineInstr &MI, Register Replacement); + /// Return true if \p MOP1 and \p MOP2 are register operands are defined by /// equivalent instructions. bool matchEqualDefs(const MachineOperand &MOP1, const MachineOperand &MOP2); @@ -243,6 +398,12 @@ public: /// Check if operand \p OpIdx is zero. bool matchOperandIsZero(MachineInstr &MI, unsigned OpIdx); + /// Check if operand \p OpIdx is undef. + bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx); + + /// Check if operand \p OpIdx is known to be a power of 2. + bool matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, unsigned OpIdx); + /// Erase \p MI bool eraseInst(MachineInstr &MI); @@ -252,6 +413,79 @@ public: bool applySimplifyAddToSub(MachineInstr &MI, std::tuple &MatchInfo); + /// Match (logic_op (op x...), (op y...)) -> (op (logic_op x, y)) + bool + matchHoistLogicOpWithSameOpcodeHands(MachineInstr &MI, + InstructionStepsMatchInfo &MatchInfo); + + /// Replace \p MI with a series of instructions described in \p MatchInfo. + bool applyBuildInstructionSteps(MachineInstr &MI, + InstructionStepsMatchInfo &MatchInfo); + + /// Match ashr (shl x, C), C -> sext_inreg (C) + bool matchAshrShlToSextInreg(MachineInstr &MI, + std::tuple &MatchInfo); + bool applyAshShlToSextInreg(MachineInstr &MI, + std::tuple &MatchInfo); + /// \return true if \p MI is a G_AND instruction whose operands are x and y + /// where x & y == x or x & y == y. (E.g., one of operands is all-ones value.) + /// + /// \param [in] MI - The G_AND instruction. + /// \param [out] Replacement - A register the G_AND should be replaced with on + /// success. + bool matchRedundantAnd(MachineInstr &MI, Register &Replacement); + + /// \return true if \p MI is a G_OR instruction whose operands are x and y + /// where x | y == x or x | y == y. (E.g., one of operands is all-zeros + /// value.) + /// + /// \param [in] MI - The G_OR instruction. 
+ /// \param [out] Replacement - A register the G_OR should be replaced with on + /// success. + bool matchRedundantOr(MachineInstr &MI, Register &Replacement); + + /// \return true if \p MI is a G_SEXT_INREG that can be erased. + bool matchRedundantSExtInReg(MachineInstr &MI); + + /// Combine inverting a result of a compare into the opposite cond code. + bool matchNotCmp(MachineInstr &MI, SmallVectorImpl &RegsToNegate); + bool applyNotCmp(MachineInstr &MI, SmallVectorImpl &RegsToNegate); + + /// Fold (xor (and x, y), y) -> (and (not x), y) + ///{ + bool matchXorOfAndWithSameReg(MachineInstr &MI, + std::pair &MatchInfo); + bool applyXorOfAndWithSameReg(MachineInstr &MI, + std::pair &MatchInfo); + ///} + + /// Combine G_PTR_ADD with nullptr to G_INTTOPTR + bool matchPtrAddZero(MachineInstr &MI); + bool applyPtrAddZero(MachineInstr &MI); + + /// Combine G_UREM x, (known power of 2) to an add and bitmasking. + bool applySimplifyURemByPow2(MachineInstr &MI); + + bool matchCombineInsertVecElts(MachineInstr &MI, + SmallVectorImpl &MatchInfo); + + bool applyCombineInsertVecElts(MachineInstr &MI, + SmallVectorImpl &MatchInfo); + + /// Match expression trees of the form + /// + /// \code + /// sN *a = ... + /// sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ... + /// \endcode + /// + /// And check if the tree can be replaced with a M-bit load + possibly a + /// bswap. + bool matchLoadOrCombine(MachineInstr &MI, + std::function &MatchInfo); + bool applyLoadOrCombine(MachineInstr &MI, + std::function &MatchInfo); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); @@ -280,6 +514,30 @@ private: /// \returns true if a candidate is found. bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, Register &Offset); + + /// Helper function for matchLoadOrCombine. Searches for Registers + /// which may have been produced by a load instruction + some arithmetic. 
+ /// + /// \param [in] Root - The search root. + /// + /// \returns The Registers found during the search. + Optional> + findCandidatesForLoadOrCombine(const MachineInstr *Root) const; + + /// Helper function for matchLoadOrCombine. + /// + /// Checks if every register in \p RegsToVisit is defined by a load + /// instruction + some arithmetic. + /// + /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up + /// at to the index of the load. + /// \param [in] MemSizeInBits - The number of bits each load should produce. + /// + /// \returns The lowest-index load found and the lowest index on success. + Optional> findLoadOffsetsForLoadOrCombine( + SmallDenseMap &MemOffset2Idx, + const SmallVector &RegsToVisit, + const unsigned MemSizeInBits); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h index d8fe4b3103db..dd7f04a33f4b 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h @@ -51,7 +51,7 @@ public: /// For convenience, finishedChangingAllUsesOfReg() will report the completion /// of the changes. The use list may change between this call and /// finishedChangingAllUsesOfReg(). - void changingAllUsesOfReg(const MachineRegisterInfo &MRI, unsigned Reg); + void changingAllUsesOfReg(const MachineRegisterInfo &MRI, Register Reg); /// All instructions reported as changing by changingAllUsesOfReg() have /// finished being changed. 
void finishedChangingAllUsesOfReg(); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h index 55cf54d6e946..eafed3760738 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h @@ -13,13 +13,11 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H #define LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Register.h" -#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" namespace llvm { @@ -36,10 +34,16 @@ class GISelKnownBits : public GISelChangeObserver { /// Cache maintained during a computeKnownBits request. SmallDenseMap ComputeKnownBitsCache; + void computeKnownBitsMin(Register Src0, Register Src1, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth = 0); + + unsigned computeNumSignBitsMin(Register Src0, Register Src1, + const APInt &DemandedElts, unsigned Depth = 0); + public: GISelKnownBits(MachineFunction &MF, unsigned MaxDepth = 6); virtual ~GISelKnownBits() = default; - void setMF(MachineFunction &MF); const MachineFunction &getMachineFunction() const { return MF; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h index b0bb519283b1..9e7ade3ee329 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/GISelWorkList.h @@ -11,9 +11,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include 
"llvm/CodeGen/MachineInstr.h" -#include "llvm/Support/Debug.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 751ab67c4e97..8eab8a5846a7 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -20,12 +20,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" -#include "llvm/CodeGen/SwiftErrorValueTracking.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/SwiftErrorValueTracking.h" #include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/CodeGen.h" #include #include @@ -36,8 +38,8 @@ class BasicBlock; class CallInst; class CallLowering; class Constant; +class ConstrainedFPIntrinsic; class DataLayout; -class FunctionLoweringInfo; class Instruction; class MachineBasicBlock; class MachineFunction; @@ -217,12 +219,14 @@ private: /// Translate an LLVM string intrinsic (memcpy, memset, ...). bool translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, - Intrinsic::ID ID); + unsigned Opcode); void getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder); bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder); + bool translateFixedPointIntrinsic(unsigned Op, const CallInst &CI, + MachineIRBuilder &MIRBuilder); /// Helper function for translateSimpleIntrinsic. /// \return The generic opcode for \p IntrinsicID if \p IntrinsicID is a @@ -256,6 +260,19 @@ private: /// \pre \p U is a call instruction. 
bool translateCall(const User &U, MachineIRBuilder &MIRBuilder); + /// When an invoke or a cleanupret unwinds to the next EH pad, there are + /// many places it could ultimately go. In the IR, we have a single unwind + /// destination, but in the machine CFG, we enumerate all the possible blocks. + /// This function skips over imaginary basic blocks that hold catchswitch + /// instructions, and finds all the "real" machine + /// basic block destinations. As those destinations may not be successors of + /// EHPadBB, here we also calculate the edge probability to those + /// destinations. The passed-in Prob is the edge probability to EHPadBB. + bool findUnwindDestinations( + const BasicBlock *EHPadBB, BranchProbability Prob, + SmallVectorImpl> + &UnwindDests); + bool translateInvoke(const User &U, MachineIRBuilder &MIRBuilder); bool translateCallBr(const User &U, MachineIRBuilder &MIRBuilder); @@ -287,11 +304,37 @@ private: /// MachineBasicBlocks for the function have been created. void finishPendingPhis(); + /// Translate \p Inst into a unary operation \p Opcode. + /// \pre \p U is a unary operation. + bool translateUnaryOp(unsigned Opcode, const User &U, + MachineIRBuilder &MIRBuilder); + /// Translate \p Inst into a binary operation \p Opcode. /// \pre \p U is a binary operation. bool translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder); + /// If the set of cases should be emitted as a series of branches, return + /// true. If we should emit this as a bunch of and/or'd together conditions, + /// return false. + bool shouldEmitAsBranches(const std::vector &Cases); + /// Helper method for findMergedConditions. + /// This function emits a branch and is used at the leaves of an OR or an + /// AND operator tree. 
+ void emitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + MachineBasicBlock *CurBB, + MachineBasicBlock *SwitchBB, + BranchProbability TProb, + BranchProbability FProb, bool InvertCond); + /// Used during condbr translation to find trees of conditions that can be + /// optimized. + void findMergedConditions(const Value *Cond, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, MachineBasicBlock *CurBB, + MachineBasicBlock *SwitchBB, + Instruction::BinaryOps Opc, BranchProbability TProb, + BranchProbability FProb, bool InvertCond); + /// Translate branch (br) instruction. /// \pre \p U is a branch instruction. bool translateBr(const User &U, MachineIRBuilder &MIRBuilder); @@ -305,19 +348,23 @@ private: void emitSwitchCase(SwitchCG::CaseBlock &CB, MachineBasicBlock *SwitchBB, MachineIRBuilder &MIB); - bool lowerJumpTableWorkItem(SwitchCG::SwitchWorkListItem W, - MachineBasicBlock *SwitchMBB, - MachineBasicBlock *CurMBB, - MachineBasicBlock *DefaultMBB, - MachineIRBuilder &MIB, - MachineFunction::iterator BBI, - BranchProbability UnhandledProbs, - SwitchCG::CaseClusterIt I, - MachineBasicBlock *Fallthrough, - bool FallthroughUnreachable); - - bool lowerSwitchRangeWorkItem(SwitchCG::CaseClusterIt I, - Value *Cond, + /// Generate for for the BitTest header block, which precedes each sequence of + /// BitTestCases. + void emitBitTestHeader(SwitchCG::BitTestBlock &BTB, + MachineBasicBlock *SwitchMBB); + /// Generate code to produces one "bit test" for a given BitTestCase \p B. 
+ void emitBitTestCase(SwitchCG::BitTestBlock &BB, MachineBasicBlock *NextMBB, + BranchProbability BranchProbToNext, Register Reg, + SwitchCG::BitTestCase &B, MachineBasicBlock *SwitchBB); + + bool lowerJumpTableWorkItem( + SwitchCG::SwitchWorkListItem W, MachineBasicBlock *SwitchMBB, + MachineBasicBlock *CurMBB, MachineBasicBlock *DefaultMBB, + MachineIRBuilder &MIB, MachineFunction::iterator BBI, + BranchProbability UnhandledProbs, SwitchCG::CaseClusterIt I, + MachineBasicBlock *Fallthrough, bool FallthroughUnreachable); + + bool lowerSwitchRangeWorkItem(SwitchCG::CaseClusterIt I, Value *Cond, MachineBasicBlock *Fallthrough, bool FallthroughUnreachable, BranchProbability UnhandledProbs, @@ -325,6 +372,14 @@ private: MachineIRBuilder &MIB, MachineBasicBlock *SwitchMBB); + bool lowerBitTestWorkItem( + SwitchCG::SwitchWorkListItem W, MachineBasicBlock *SwitchMBB, + MachineBasicBlock *CurMBB, MachineBasicBlock *DefaultMBB, + MachineIRBuilder &MIB, MachineFunction::iterator BBI, + BranchProbability DefaultProb, BranchProbability UnhandledProbs, + SwitchCG::CaseClusterIt I, MachineBasicBlock *Fallthrough, + bool FallthroughUnreachable); + bool lowerSwitchWorkItem(SwitchCG::SwitchWorkListItem W, Value *Cond, MachineBasicBlock *SwitchMBB, MachineBasicBlock *DefaultMBB, @@ -351,8 +406,6 @@ private: /// \pre \p U is a return instruction. 
bool translateRet(const User &U, MachineIRBuilder &MIRBuilder); - bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder); - bool translateFNeg(const User &U, MachineIRBuilder &MIRBuilder); bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) { @@ -437,6 +490,9 @@ private: bool translateFAdd(const User &U, MachineIRBuilder &MIRBuilder) { return translateBinaryOp(TargetOpcode::G_FADD, U, MIRBuilder); } + bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { + return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); + } bool translateFMul(const User &U, MachineIRBuilder &MIRBuilder) { return translateBinaryOp(TargetOpcode::G_FMUL, U, MIRBuilder); } @@ -515,6 +571,8 @@ private: /// Current target configuration. Controls how the pass handles errors. const TargetPassConfig *TPC; + CodeGenOpt::Level OptLevel; + /// Current optimization remark emitter. Used to report failures. std::unique_ptr ORE; @@ -614,12 +672,12 @@ private: BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const; - void addSuccessorWithProb(MachineBasicBlock *Src, MachineBasicBlock *Dst, - BranchProbability Prob); + void addSuccessorWithProb( + MachineBasicBlock *Src, MachineBasicBlock *Dst, + BranchProbability Prob = BranchProbability::getUnknown()); public: - // Ctor, nothing fancy. 
- IRTranslator(); + IRTranslator(CodeGenOpt::Level OptLevel = CodeGenOpt::None); StringRef getPassName() const override { return "IRTranslator"; } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index 1af96cb4a9ee..5b8243a93e7f 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -112,6 +112,14 @@ enum { /// - InsnID - Instruction ID /// - Expected opcode GIM_CheckOpcode, + + /// Check the opcode on the specified instruction, checking 2 acceptable + /// alternatives. + /// - InsnID - Instruction ID + /// - Expected opcode + /// - Alternative expected opcode + GIM_CheckOpcodeIsEither, + /// Check the instruction has the right number of operands /// - InsnID - Instruction ID /// - Expected number of operands @@ -164,6 +172,15 @@ enum { GIM_CheckMemorySizeEqualToLLT, GIM_CheckMemorySizeLessThanLLT, GIM_CheckMemorySizeGreaterThanLLT, + + /// Check if this is a vector that can be treated as a vector splat + /// constant. This is valid for both G_BUILD_VECTOR as well as + /// G_BUILD_VECTOR_TRUNC. For AllOnes refers to individual bits, so a -1 + /// element. + /// - InsnID - Instruction ID + GIM_CheckIsBuildVectorAllOnes, + GIM_CheckIsBuildVectorAllZeros, + /// Check a generic C++ instruction predicate /// - InsnID - Instruction ID /// - PredicateID - The ID of the predicate function to call @@ -237,6 +254,15 @@ enum { /// - OtherOpIdx - Other operand index GIM_CheckIsSameOperand, + /// Predicates with 'let PredicateCodeUsesOperands = 1' need to examine some + /// named operands that will be recorded in RecordedOperands. Names of these + /// operands are referenced in predicate argument list. Emitter determines + /// StoreIdx(corresponds to the order in which names appear in argument list). 
+ /// - InsnID - Instruction ID + /// - OpIdx - Operand index + /// - StoreIdx - Store location in RecordedOperands. + GIM_RecordNamedOperand, + /// Fail the current try-block, or completely fail to match if there is no /// current try-block. GIM_Reject, @@ -429,6 +455,11 @@ protected: std::vector Renderers; RecordedMIVector MIs; DenseMap TempRegisters; + /// Named operands that predicate with 'let PredicateCodeUsesOperands = 1' + /// referenced in its argument list. Operands are inserted at index set by + /// emitter, it corresponds to the order in which names appear in argument + /// list. Currently such predicates don't have more then 3 arguments. + std::array RecordedOperands; MatcherState(unsigned MaxRenderers); }; @@ -489,21 +520,13 @@ protected: llvm_unreachable( "Subclasses must override this with a tablegen-erated function"); } - virtual bool testMIPredicate_MI(unsigned, const MachineInstr &) const { + virtual bool testMIPredicate_MI( + unsigned, const MachineInstr &, + const std::array &Operands) const { llvm_unreachable( "Subclasses must override this with a tablegen-erated function"); } - /// Constrain a register operand of an instruction \p I to a specified - /// register class. This could involve inserting COPYs before (for uses) or - /// after (for defs) and may replace the operand of \p I. - /// \returns whether operand regclass constraining succeeded. 
- bool constrainOperandRegToRegClass(MachineInstr &I, unsigned OpIdx, - const TargetRegisterClass &RC, - const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) const; - bool isOperandImmEqual(const MachineOperand &MO, int64_t Value, const MachineRegisterInfo &MRI) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index 73ac578d61be..82e26b0bc355 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -154,24 +154,31 @@ bool InstructionSelector::executeMatchTable( break; } - case GIM_CheckOpcode: { + case GIM_CheckOpcode: + case GIM_CheckOpcodeIsEither: { int64_t InsnID = MatchTable[CurrentIdx++]; - int64_t Expected = MatchTable[CurrentIdx++]; + int64_t Expected0 = MatchTable[CurrentIdx++]; + int64_t Expected1 = -1; + if (MatcherOpcode == GIM_CheckOpcodeIsEither) + Expected1 = MatchTable[CurrentIdx++]; assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); unsigned Opcode = State.MIs[InsnID]->getOpcode(); DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), - dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID - << "], ExpectedOpcode=" << Expected - << ") // Got=" << Opcode << "\n"); - if (Opcode != Expected) { + dbgs() << CurrentIdx << ": GIM_CheckOpcode(MIs[" << InsnID + << "], ExpectedOpcode=" << Expected0; + if (MatcherOpcode == GIM_CheckOpcodeIsEither) + dbgs() << " || " << Expected1; + dbgs() << ") // Got=" << Opcode << "\n"; + ); + + if (Opcode != Expected0 && Opcode != Expected1) { if (handleReject() == RejectAndGiveUp) return false; } break; } - case GIM_SwitchOpcode: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t LowerBound = MatchTable[CurrentIdx++]; @@ -193,7 +200,7 @@ bool InstructionSelector::executeMatchTable( CurrentIdx = 
MatchTable[CurrentIdx + (Opcode - LowerBound)]; if (!CurrentIdx) { CurrentIdx = Default; - break; + break; } OnFailResumeAt.push_back(Default); break; @@ -321,6 +328,35 @@ bool InstructionSelector::executeMatchTable( return false; break; } + case GIM_CheckIsBuildVectorAllOnes: + case GIM_CheckIsBuildVectorAllZeros: { + int64_t InsnID = MatchTable[CurrentIdx++]; + + DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), + dbgs() << CurrentIdx + << ": GIM_CheckBuildVectorAll{Zeros|Ones}(MIs[" + << InsnID << "])\n"); + assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); + + const MachineInstr *MI = State.MIs[InsnID]; + assert((MI->getOpcode() == TargetOpcode::G_BUILD_VECTOR || + MI->getOpcode() == TargetOpcode::G_BUILD_VECTOR_TRUNC) && + "Expected G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC"); + + if (MatcherOpcode == GIM_CheckIsBuildVectorAllOnes) { + if (!isBuildVectorAllOnes(*MI, MRI)) { + if (handleReject() == RejectAndGiveUp) + return false; + } + } else { + if (!isBuildVectorAllZeros(*MI, MRI)) { + if (handleReject() == RejectAndGiveUp) + return false; + } + } + + break; + } case GIM_CheckCxxInsnPredicate: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t Predicate = MatchTable[CurrentIdx++]; @@ -331,7 +367,8 @@ bool InstructionSelector::executeMatchTable( assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); assert(Predicate > GIPFP_MI_Invalid && "Expected a valid predicate"); - if (!testMIPredicate_MI(Predicate, *State.MIs[InsnID])) + if (!testMIPredicate_MI(Predicate, *State.MIs[InsnID], + State.RecordedOperands)) if (handleReject() == RejectAndGiveUp) return false; break; @@ -581,6 +618,20 @@ bool InstructionSelector::executeMatchTable( break; } + case GIM_RecordNamedOperand: { + int64_t InsnID = MatchTable[CurrentIdx++]; + int64_t OpIdx = MatchTable[CurrentIdx++]; + uint64_t StoreIdx = MatchTable[CurrentIdx++]; + + DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), + dbgs() << CurrentIdx << ": GIM_RecordNamedOperand(MIs[" + << InsnID 
<< "]->getOperand(" << OpIdx + << "), StoreIdx=" << StoreIdx << ")\n"); + assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); + assert(StoreIdx < State.RecordedOperands.size() && "Index out of range"); + State.RecordedOperands[StoreIdx] = &State.MIs[InsnID]->getOperand(OpIdx); + break; + } case GIM_CheckRegBankForClass: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t OpIdx = MatchTable[CurrentIdx++]; @@ -1007,8 +1058,12 @@ bool InstructionSelector::executeMatchTable( int64_t OpIdx = MatchTable[CurrentIdx++]; int64_t RCEnum = MatchTable[CurrentIdx++]; assert(OutMIs[InsnID] && "Attempted to add to undefined instruction"); - constrainOperandRegToRegClass(*OutMIs[InsnID].getInstr(), OpIdx, - *TRI.getRegClass(RCEnum), TII, TRI, RBI); + MachineInstr &I = *OutMIs[InsnID].getInstr(); + MachineFunction &MF = *I.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass &RC = *TRI.getRegClass(RCEnum); + MachineOperand &MO = I.getOperand(OpIdx); + constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, RC, MO); DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), dbgs() << CurrentIdx << ": GIR_ConstrainOperandRC(OutMIs[" << InsnID << "], " << OpIdx << ", " << RCEnum diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index 016b0bacab85..e7bda3b4bd97 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -105,19 +105,23 @@ public: Register SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg()); // zext(trunc x) - > and (aext/copy/trunc x), mask + // zext(sext x) -> and (sext x), mask Register TruncSrc; - if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) { + Register SextSrc; + if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc))) || + 
mi_match(SrcReg, MRI, m_GSExt(m_Reg(SextSrc)))) { LLT DstTy = MRI.getType(DstReg); if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) || isConstantUnsupported(DstTy)) return false; LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;); LLT SrcTy = MRI.getType(SrcReg); - APInt Mask = APInt::getAllOnesValue(SrcTy.getScalarSizeInBits()); - auto MIBMask = Builder.buildConstant( - DstTy, Mask.zext(DstTy.getScalarSizeInBits())); - Builder.buildAnd(DstReg, Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), - MIBMask); + APInt MaskVal = APInt::getAllOnesValue(SrcTy.getScalarSizeInBits()); + auto Mask = Builder.buildConstant( + DstTy, MaskVal.zext(DstTy.getScalarSizeInBits())); + auto Extended = SextSrc ? Builder.buildSExtOrTrunc(DstTy, SextSrc) : + Builder.buildAnyExtOrTrunc(DstTy, TruncSrc); + Builder.buildAnd(DstReg, Extended, Mask); markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts); return true; } @@ -482,7 +486,7 @@ public: MachineRegisterInfo &MRI, MachineIRBuilder &Builder, SmallVectorImpl &UpdatedDefs, - GISelObserverWrapper &Observer) { + GISelChangeObserver &Observer) { if (!llvm::canReplaceReg(DstReg, SrcReg, MRI)) { Builder.buildCopy(DstReg, SrcReg); UpdatedDefs.push_back(DstReg); @@ -502,20 +506,78 @@ public: Observer.changedInstr(*UseMI); } - bool tryCombineMerges(MachineInstr &MI, - SmallVectorImpl &DeadInsts, - SmallVectorImpl &UpdatedDefs, - GISelObserverWrapper &Observer) { + /// Return the operand index in \p MI that defines \p Def + static unsigned getDefIndex(const MachineInstr &MI, Register SearchDef) { + unsigned DefIdx = 0; + for (const MachineOperand &Def : MI.defs()) { + if (Def.getReg() == SearchDef) + break; + ++DefIdx; + } + + return DefIdx; + } + + bool tryCombineUnmergeValues(MachineInstr &MI, + SmallVectorImpl &DeadInsts, + SmallVectorImpl &UpdatedDefs, + GISelChangeObserver &Observer) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); unsigned NumDefs = MI.getNumOperands() - 1; - MachineInstr *SrcDef = - 
getDefIgnoringCopies(MI.getOperand(NumDefs).getReg(), MRI); + Register SrcReg = MI.getOperand(NumDefs).getReg(); + MachineInstr *SrcDef = getDefIgnoringCopies(SrcReg, MRI); if (!SrcDef) return false; LLT OpTy = MRI.getType(MI.getOperand(NumDefs).getReg()); LLT DestTy = MRI.getType(MI.getOperand(0).getReg()); + + if (SrcDef->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + // %0:_(<4 x s16>) = G_FOO + // %1:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %0 + // %3:_(s16), %4:_(s16) = G_UNMERGE_VALUES %1 + // + // %3:_(s16), %4:_(s16), %5:_(s16), %6:_(s16) = G_UNMERGE_VALUES %0 + const unsigned NumSrcOps = SrcDef->getNumOperands(); + Register SrcUnmergeSrc = SrcDef->getOperand(NumSrcOps - 1).getReg(); + LLT SrcUnmergeSrcTy = MRI.getType(SrcUnmergeSrc); + + // If we need to decrease the number of vector elements in the result type + // of an unmerge, this would involve the creation of an equivalent unmerge + // to copy back to the original result registers. + LegalizeActionStep ActionStep = LI.getAction( + {TargetOpcode::G_UNMERGE_VALUES, {OpTy, SrcUnmergeSrcTy}}); + switch (ActionStep.Action) { + case LegalizeActions::Lower: + case LegalizeActions::Unsupported: + break; + case LegalizeActions::FewerElements: + case LegalizeActions::NarrowScalar: + if (ActionStep.TypeIdx == 1) + return false; + break; + default: + return false; + } + + Builder.setInstrAndDebugLoc(MI); + auto NewUnmerge = Builder.buildUnmerge(DestTy, SrcUnmergeSrc); + + // TODO: Should we try to process out the other defs now? If the other + // defs of the source unmerge are also unmerged, we end up with a separate + // unmerge for each one. 
+ unsigned SrcDefIdx = getDefIndex(*SrcDef, SrcReg); + for (unsigned I = 0; I != NumDefs; ++I) { + Register Def = MI.getOperand(I).getReg(); + replaceRegOrBuildCopy(Def, NewUnmerge.getReg(SrcDefIdx * NumDefs + I), + MRI, Builder, UpdatedDefs, Observer); + } + + markInstAndDefDead(MI, *SrcDef, DeadInsts, SrcDefIdx); + return true; + } + MachineInstr *MergeI = SrcDef; unsigned ConvertOp = 0; @@ -743,9 +805,12 @@ public: Changed = tryCombineSExt(MI, DeadInsts, UpdatedDefs); break; case TargetOpcode::G_UNMERGE_VALUES: - Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs, WrapperObserver); + Changed = + tryCombineUnmergeValues(MI, DeadInsts, UpdatedDefs, WrapperObserver); break; case TargetOpcode::G_MERGE_VALUES: + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_CONCAT_VECTORS: // If any of the users of this merge are an unmerge, then add them to the // artifact worklist in case there's folding that can be done looking up. for (MachineInstr &U : MRI.use_instructions(MI.getOperand(0).getReg())) { @@ -829,7 +894,8 @@ private: /// dead. /// MI is not marked dead. void markDefDead(MachineInstr &MI, MachineInstr &DefMI, - SmallVectorImpl &DeadInsts) { + SmallVectorImpl &DeadInsts, + unsigned DefIdx = 0) { // Collect all the copy instructions that are made dead, due to deleting // this instruction. Collect all of them until the Trunc(DefMI). // Eg, @@ -856,8 +922,27 @@ private: break; PrevMI = TmpDef; } - if (PrevMI == &DefMI && MRI.hasOneUse(DefMI.getOperand(0).getReg())) - DeadInsts.push_back(&DefMI); + + if (PrevMI == &DefMI) { + unsigned I = 0; + bool IsDead = true; + for (MachineOperand &Def : DefMI.defs()) { + if (I != DefIdx) { + if (!MRI.use_empty(Def.getReg())) { + IsDead = false; + break; + } + } else { + if (!MRI.hasOneUse(DefMI.getOperand(DefIdx).getReg())) + break; + } + + ++I; + } + + if (IsDead) + DeadInsts.push_back(&DefMI); + } } /// Mark MI as dead. 
If a def of one of MI's operands, DefMI, would also be @@ -866,9 +951,10 @@ private: /// copies in between the extends and the truncs, and this attempts to collect /// the in between copies if they're dead. void markInstAndDefDead(MachineInstr &MI, MachineInstr &DefMI, - SmallVectorImpl &DeadInsts) { + SmallVectorImpl &DeadInsts, + unsigned DefIdx = 0) { DeadInsts.push_back(&MI); - markDefDead(MI, DefMI, DeadInsts); + markDefDead(MI, DefMI, DeadInsts, DefIdx); } /// Erase the dead instructions in the list and call the observer hooks. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h index e59bf1b91262..690e84f79a6b 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Legalizer.h @@ -64,9 +64,6 @@ public: MachineFunctionProperties::Property::NoPHIs); } - bool combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, - const TargetInstrInfo &TII); - bool runOnMachineFunction(MachineFunction &MF) override; static MFResult diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 058aacf38634..4a982b00125d 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -32,6 +32,7 @@ class LegalizerInfo; class Legalizer; class MachineRegisterInfo; class GISelChangeObserver; +class TargetLowering; class LegalizerHelper { public: @@ -45,6 +46,7 @@ public: private: MachineRegisterInfo &MRI; const LegalizerInfo &LI; + const TargetLowering &TLI; public: enum LegalizeResult { @@ -62,6 +64,7 @@ public: /// Expose LegalizerInfo so the clients can re-use. 
const LegalizerInfo &getLegalizerInfo() const { return LI; } + const TargetLowering &getTargetLowering() const { return TLI; } LegalizerHelper(MachineFunction &MF, GISelChangeObserver &Observer, MachineIRBuilder &B); @@ -154,6 +157,10 @@ public: /// def by inserting a G_BITCAST from \p CastTy void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + /// Widen \p OrigReg to \p WideTy by merging to a wider type, padding with + /// G_IMPLICIT_DEF, and producing dead results. + Register widenWithUnmerge(LLT WideTy, Register OrigReg); + private: LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); @@ -163,8 +170,10 @@ private: widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); LegalizeResult widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); - LegalizeResult widenScalarAddSubSat(MachineInstr &MI, unsigned TypeIdx, - LLT WideTy); + LegalizeResult widenScalarAddoSubo(MachineInstr &MI, unsigned TypeIdx, + LLT WideTy); + LegalizeResult widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx, + LLT WideTy); /// Helper function to split a wide generic register into bitwise blocks with /// the given Type (which implies the number of blocks needed). The generic @@ -191,11 +200,19 @@ private: LLT PartTy, ArrayRef PartRegs, LLT LeftoverTy = LLT(), ArrayRef LeftoverRegs = {}); - /// Unmerge \p SrcReg into \p Parts with the greatest common divisor type with - /// \p DstTy and \p NarrowTy. Returns the GCD type. + /// Unmerge \p SrcReg into smaller sized values, and append them to \p + /// Parts. The elements of \p Parts will be the greatest common divisor type + /// of \p DstTy, \p NarrowTy and the type of \p SrcReg. This will compute and + /// return the GCD type. LLT extractGCDType(SmallVectorImpl &Parts, LLT DstTy, LLT NarrowTy, Register SrcReg); + /// Unmerge \p SrcReg into \p GCDTy typed registers. This will append all of + /// the unpacked registers to \p Parts. 
This version is if the common unmerge + /// type is already known. + void extractGCDType(SmallVectorImpl &Parts, LLT GCDTy, + Register SrcReg); + /// Produce a merge of values in \p VRegs to define \p DstReg. Perform a merge /// from the least common multiple type, and convert as appropriate to \p /// DstReg. @@ -228,7 +245,23 @@ private: ArrayRef Src1Regs, ArrayRef Src2Regs, LLT NarrowTy); + void changeOpcode(MachineInstr &MI, unsigned NewOpcode); + public: + /// Return the alignment to use for a stack temporary object with the given + /// type. + Align getStackTemporaryAlignment(LLT Type, Align MinAlign = Align()) const; + + /// Create a stack temporary based on the size in bytes and the alignment + MachineInstrBuilder createStackTemporary(TypeSize Bytes, Align Alignment, + MachinePointerInfo &PtrInfo); + + /// Get a pointer to vector element \p Index located in memory for a vector of + /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out + /// of bounds the returned pointer is unspecified, but will be within the + /// vector bounds. 
+ Register getVectorElementPointer(Register VecPtr, LLT VecTy, Register Index); + LegalizeResult fewerElementsVectorImplicitDef(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); @@ -256,9 +289,11 @@ public: LegalizeResult fewerElementsVectorUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); - LegalizeResult fewerElementsVectorBuildVector(MachineInstr &MI, - unsigned TypeIdx, - LLT NarrowTy); + LegalizeResult fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy); + LegalizeResult fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowTy); LegalizeResult reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); @@ -281,6 +316,7 @@ public: LegalizeResult narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarMul(MachineInstr &MI, LLT Ty); + LegalizeResult narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx, LLT Ty); @@ -291,34 +327,52 @@ public: LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + /// Perform Bitcast legalize action on G_EXTRACT_VECTOR_ELT. + LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy); + + /// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT. 
+ LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy); + LegalizeResult lowerBitcast(MachineInstr &MI); - LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerLoad(MachineInstr &MI); + LegalizeResult lowerStore(MachineInstr &MI); + LegalizeResult lowerBitCount(MachineInstr &MI); LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI); - LegalizeResult lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); - LegalizeResult lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); - LegalizeResult lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerUITOFP(MachineInstr &MI); + LegalizeResult lowerSITOFP(MachineInstr &MI); + LegalizeResult lowerFPTOUI(MachineInstr &MI); LegalizeResult lowerFPTOSI(MachineInstr &MI); LegalizeResult lowerFPTRUNC_F64_TO_F16(MachineInstr &MI); - LegalizeResult lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerFPTRUNC(MachineInstr &MI); + LegalizeResult lowerFPOWI(MachineInstr &MI); - LegalizeResult lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty); - LegalizeResult lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerMinMax(MachineInstr &MI); + LegalizeResult lowerFCopySign(MachineInstr &MI); LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); LegalizeResult lowerFMad(MachineInstr &MI); LegalizeResult lowerIntrinsicRound(MachineInstr &MI); LegalizeResult lowerFFloor(MachineInstr &MI); LegalizeResult lowerMergeValues(MachineInstr &MI); LegalizeResult lowerUnmergeValues(MachineInstr &MI); + LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); LegalizeResult lowerExtract(MachineInstr &MI); LegalizeResult lowerInsert(MachineInstr &MI); LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); + LegalizeResult lowerAddSubSatToMinMax(MachineInstr &MI); + LegalizeResult 
lowerAddSubSatToAddoSubo(MachineInstr &MI); + LegalizeResult lowerShlSat(MachineInstr &MI); LegalizeResult lowerBswap(MachineInstr &MI); LegalizeResult lowerBitreverse(MachineInstr &MI); LegalizeResult lowerReadWriteRegister(MachineInstr &MI); + LegalizeResult lowerSMULH_UMULH(MachineInstr &MI); + LegalizeResult lowerSelect(MachineInstr &MI); + }; /// Helper function that creates a libcall to the given \p Name using the given diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 61e0418757bc..c0a89b6ae619 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -37,7 +37,6 @@ extern cl::opt DisableGISelLegalityCheck; class LegalizerHelper; class MachineInstr; -class MachineIRBuilder; class MachineRegisterInfo; class MCInstrInfo; class GISelChangeObserver; @@ -183,7 +182,7 @@ struct TypePairAndMemDesc { MemSize == Other.MemSize; } - /// \returns true if this memory access is legal with for the acecss described + /// \returns true if this memory access is legal with for the access described /// by \p Other (The alignment is sufficient for the size and result type). bool isCompatible(const TypePairAndMemDesc &Other) const { return Type0 == Other.Type0 && Type1 == Other.Type1 && @@ -218,11 +217,19 @@ Predicate any(Predicate P0, Predicate P1, Args... args) { return any(any(P0, P1), args...); } -/// True iff the given type index is the specified types. +/// True iff the given type index is the specified type. LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit); /// True iff the given type index is one of the specified types. LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list TypesInit); + +/// True iff the given type index is not the specified type. 
+inline LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx] != Type; + }; +} + /// True iff the given types for the given pair of type indexes is one of the /// specified type pairs. LegalityPredicate @@ -308,6 +315,11 @@ LegalizeMutation changeElementTo(unsigned TypeIdx, unsigned FromTypeIdx); /// Keep the same scalar or element type as the given type. LegalizeMutation changeElementTo(unsigned TypeIdx, LLT Ty); +/// Change the scalar size or element size to have the same scalar size as type +/// index \p FromIndex. Unlike changeElementTo, this discards pointer types and +/// only changes the size. +LegalizeMutation changeElementSizeTo(unsigned TypeIdx, unsigned FromTypeIdx); + /// Widen the scalar type or vector element type for the given type index to the /// next power of 2. LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min = 0); @@ -616,8 +628,7 @@ public: /// The instruction is lowered when type index 0 is any type in the given /// list. Keep type index 0 as the same type. LegalizeRuleSet &lowerFor(std::initializer_list Types) { - return actionFor(LegalizeAction::Lower, Types, - LegalizeMutations::changeTo(0, 0)); + return actionFor(LegalizeAction::Lower, Types); } /// The instruction is lowered when type index 0 is any type in the given /// list. @@ -628,8 +639,7 @@ public: /// The instruction is lowered when type indexes 0 and 1 is any type pair in /// the given list. Keep type index 0 as the same type. LegalizeRuleSet &lowerFor(std::initializer_list> Types) { - return actionFor(LegalizeAction::Lower, Types, - LegalizeMutations::changeTo(0, 0)); + return actionFor(LegalizeAction::Lower, Types); } /// The instruction is lowered when type indexes 0 and 1 is any type pair in /// the given list. @@ -654,6 +664,15 @@ public: Types2); } + /// The instruction is emitted as a library call. 
+ LegalizeRuleSet &libcall() { + using namespace LegalizeMutations; + // We have no choice but conservatively assume that predicate-less lowering + // properly handles all type indices by design: + markAllIdxsAsCovered(); + return actionIf(LegalizeAction::Libcall, always); + } + /// Like legalIf, but for the Libcall action. LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a libcall with a @@ -696,6 +715,13 @@ public: markAllIdxsAsCovered(); return actionIf(LegalizeAction::NarrowScalar, Predicate, Mutation); } + /// Narrow the scalar, specified in mutation, when type indexes 0 and 1 is any + /// type pair in the given list. + LegalizeRuleSet & + narrowScalarFor(std::initializer_list> Types, + LegalizeMutation Mutation) { + return actionFor(LegalizeAction::NarrowScalar, Types, Mutation); + } /// Add more elements to reach the type selected by the mutation if the /// predicate is true. @@ -800,6 +826,13 @@ public: LegalizeMutations::scalarize(TypeIdx)); } + LegalizeRuleSet &scalarizeIf(LegalityPredicate Predicate, unsigned TypeIdx) { + using namespace LegalityPredicates; + return actionIf(LegalizeAction::FewerElements, + all(Predicate, isVector(typeIdx(TypeIdx))), + LegalizeMutations::scalarize(TypeIdx)); + } + /// Ensure the scalar or element is at least as wide as Ty. 
LegalizeRuleSet &minScalarOrElt(unsigned TypeIdx, const LLT Ty) { using namespace LegalityPredicates; @@ -857,7 +890,10 @@ public: return actionIf( LegalizeAction::NarrowScalar, [=](const LegalityQuery &Query) { - return scalarWiderThan(TypeIdx, Ty.getSizeInBits()) && Predicate(Query); + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isScalar() && + QueryTy.getSizeInBits() > Ty.getSizeInBits() && + Predicate(Query); }, changeElementTo(typeIdx(TypeIdx), Ty)); } @@ -883,11 +919,25 @@ public: return Query.Types[LargeTypeIdx].getScalarSizeInBits() > Query.Types[TypeIdx].getSizeInBits(); }, + LegalizeMutations::changeElementSizeTo(TypeIdx, LargeTypeIdx)); + } + + /// Narrow the scalar to match the size of another. + LegalizeRuleSet &maxScalarSameAs(unsigned TypeIdx, unsigned NarrowTypeIdx) { + typeIdx(TypeIdx); + return narrowScalarIf( [=](const LegalityQuery &Query) { - LLT T = Query.Types[LargeTypeIdx]; - return std::make_pair(TypeIdx, - T.isVector() ? T.getElementType() : T); - }); + return Query.Types[NarrowTypeIdx].getScalarSizeInBits() < + Query.Types[TypeIdx].getSizeInBits(); + }, + LegalizeMutations::changeElementSizeTo(TypeIdx, NarrowTypeIdx)); + } + + /// Change the type \p TypeIdx to have the same scalar size as type \p + /// SameSizeIdx. + LegalizeRuleSet &scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx) { + return minScalarSameAs(TypeIdx, SameSizeIdx) + .maxScalarSameAs(TypeIdx, SameSizeIdx); } /// Conditionally widen the scalar or elt to match the size of another. 
@@ -1207,6 +1257,12 @@ public: bool isLegal(const LegalityQuery &Query) const { return getAction(Query).Action == LegalizeAction::Legal; } + + bool isLegalOrCustom(const LegalityQuery &Query) const { + auto Action = getAction(Query).Action; + return Action == LegalizeAction::Legal || Action == LegalizeAction::Custom; + } + bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const; bool isLegalOrCustom(const MachineInstr &MI, const MachineRegisterInfo &MRI) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h index 67e450641eaf..1d1afff7f934 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -52,9 +52,6 @@ private: /// TTI used for getting remat costs for instructions. TargetTransformInfo *TTI; - /// Check whether or not \p MI needs to be moved close to its uses. - bool shouldLocalize(const MachineInstr &MI); - /// Check if \p MOUse is used in the same basic block as \p Def. /// If the use is in the same block, we say it is local. /// When the use is not local, \p InsertMBB will contain the basic @@ -67,6 +64,11 @@ private: typedef SmallSetVector LocalizedSetVecT; + /// If \p Op is a phi operand and not unique in that phi, that is, + /// there are other operands in the phi with the same register, + /// return true. + bool isNonUniquePhiValue(MachineOperand &Op) const; + /// Do inter-block localization from the entry block. 
bool localizeInterBlock(MachineFunction &MF, LocalizedSetVecT &LocalizedInstrs); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index 043be086ff41..55d6d365fbb4 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -39,11 +39,25 @@ inline OneUse_match m_OneUse(const SubPat &SP) { return SP; } +template struct OneNonDBGUse_match { + SubPatternT SubPat; + OneNonDBGUse_match(const SubPatternT &SP) : SubPat(SP) {} + + bool match(const MachineRegisterInfo &MRI, Register Reg) { + return MRI.hasOneNonDBGUse(Reg) && SubPat.match(MRI, Reg); + } +}; + +template +inline OneNonDBGUse_match m_OneNonDBGUse(const SubPat &SP) { + return SP; +} + struct ConstantMatch { int64_t &CR; ConstantMatch(int64_t &C) : CR(C) {} bool match(const MachineRegisterInfo &MRI, Register Reg) { - if (auto MaybeCst = getConstantVRegVal(Reg, MRI)) { + if (auto MaybeCst = getConstantVRegSExtVal(Reg, MRI)) { CR = *MaybeCst; return true; } @@ -53,6 +67,29 @@ struct ConstantMatch { inline ConstantMatch m_ICst(int64_t &Cst) { return ConstantMatch(Cst); } +/// Matcher for a specific constant value. +struct SpecificConstantMatch { + int64_t RequestedVal; + SpecificConstantMatch(int64_t RequestedVal) : RequestedVal(RequestedVal) {} + bool match(const MachineRegisterInfo &MRI, Register Reg) { + int64_t MatchedVal; + return mi_match(Reg, MRI, m_ICst(MatchedVal)) && MatchedVal == RequestedVal; + } +}; + +/// Matches a constant equal to \p RequestedValue. +inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) { + return SpecificConstantMatch(RequestedValue); +} + +///{ +/// Convenience matchers for specific integer values. 
+inline SpecificConstantMatch m_ZeroInt() { return SpecificConstantMatch(0); } +inline SpecificConstantMatch m_AllOnesInt() { + return SpecificConstantMatch(-1); +} +///} + // TODO: Rework this for different kinds of MachineOperand. // Currently assumes the Src for a match is a register. // We might want to support taking in some MachineOperands and call getReg on @@ -197,6 +234,12 @@ m_GAdd(const LHS &L, const RHS &R) { return BinaryOp_match(L, R); } +template +inline BinaryOp_match +m_GPtrAdd(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + template inline BinaryOp_match m_GSub(const LHS &L, const RHS &R) { @@ -233,6 +276,12 @@ m_GAnd(const LHS &L, const RHS &R) { return BinaryOp_match(L, R); } +template +inline BinaryOp_match +m_GXor(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + template inline BinaryOp_match m_GOr(const LHS &L, const RHS &R) { @@ -251,6 +300,12 @@ m_GLShr(const LHS &L, const RHS &R) { return BinaryOp_match(L, R); } +template +inline BinaryOp_match +m_GAShr(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc template struct UnaryOp_match { SrcTy L; @@ -384,6 +439,51 @@ struct CheckType { inline CheckType m_SpecificType(LLT Ty) { return Ty; } +template +struct TernaryOp_match { + Src0Ty Src0; + Src1Ty Src1; + Src2Ty Src2; + + TernaryOp_match(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) + : Src0(Src0), Src1(Src1), Src2(Src2) {} + template + bool match(const MachineRegisterInfo &MRI, OpTy &&Op) { + MachineInstr *TmpMI; + if (mi_match(Op, MRI, m_MInstr(TmpMI))) { + if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 4) { + return (Src0.match(MRI, TmpMI->getOperand(1).getReg()) && + Src1.match(MRI, TmpMI->getOperand(2).getReg()) && + Src2.match(MRI, TmpMI->getOperand(3).getReg())); + } + } + return false; + } +}; +template +inline TernaryOp_match +m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty 
&Src2) { + return TernaryOp_match(Src0, Src1, Src2); +} + +/// Matches a register negated by a G_SUB. +/// G_SUB 0, %negated_reg +template +inline BinaryOp_match +m_Neg(const SrcTy &&Src) { + return m_GSub(m_ZeroInt(), Src); +} + +/// Matches a register not-ed by a G_XOR. +/// G_XOR %not_reg, -1 +template +inline BinaryOp_match +m_Not(const SrcTy &&Src) { + return m_GXor(Src, m_AllOnesInt()); +} + } // namespace GMIPatternMatch } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index d6498345f25c..1ab4cd704824 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -18,9 +18,10 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugLoc.h" - +#include "llvm/IR/Module.h" namespace llvm { @@ -223,6 +224,7 @@ class MachineIRBuilder { protected: void validateTruncExt(const LLT Dst, const LLT Src, bool IsExtend); + void validateUnaryOp(const LLT Res, const LLT Op0); void validateBinaryOp(const LLT Res, const LLT Op0, const LLT Op1); void validateShiftOp(const LLT Res, const LLT Op0, const LLT Op1); @@ -250,6 +252,11 @@ public: setDebugLoc(MI.getDebugLoc()); } + MachineIRBuilder(MachineInstr &MI, GISelChangeObserver &Observer) : + MachineIRBuilder(MI) { + setChangeObserver(Observer); + } + virtual ~MachineIRBuilder() = default; MachineIRBuilder(const MachineIRBuilderState &BState) : State(BState) {} @@ -729,7 +736,7 @@ public: /// depend on bit 0 (for now). /// /// \return The newly created instruction. 
- MachineInstrBuilder buildBrCond(Register Tst, MachineBasicBlock &Dest); + MachineInstrBuilder buildBrCond(const SrcOp &Tst, MachineBasicBlock &Dest); /// Build and insert G_BRINDIRECT \p Tgt /// @@ -813,7 +820,17 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildLoad(const DstOp &Res, const SrcOp &Addr, - MachineMemOperand &MMO); + MachineMemOperand &MMO) { + return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO); + } + + /// Build and insert a G_LOAD instruction, while constructing the + /// MachineMemOperand. + MachineInstrBuilder + buildLoad(const DstOp &Res, const SrcOp &Addr, MachinePointerInfo PtrInfo, + Align Alignment, + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes()); /// Build and insert `Res = Addr, MMO`. /// @@ -847,6 +864,14 @@ public: MachineInstrBuilder buildStore(const SrcOp &Val, const SrcOp &Addr, MachineMemOperand &MMO); + /// Build and insert a G_STORE instruction, while constructing the + /// MachineMemOperand. + MachineInstrBuilder + buildStore(const SrcOp &Val, const SrcOp &Addr, MachinePointerInfo PtrInfo, + Align Alignment, + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes()); + /// Build and insert `Res0, ... = G_EXTRACT Src, Idx0`. /// /// \pre setBasicBlock or setMI must have been called. @@ -938,6 +963,23 @@ public: MachineInstrBuilder buildBuildVectorTrunc(const DstOp &Res, ArrayRef Ops); + /// Build and insert a vector splat of a scalar \p Src using a + /// G_INSERT_VECTOR_ELT and G_SHUFFLE_VECTOR idiom. + /// + /// \pre setBasicBlock or setMI must have been called. + /// \pre \p Src must have the same type as the element type of \p Dst + /// + /// \return a MachineInstrBuilder for the newly created instruction. 
+ MachineInstrBuilder buildShuffleSplat(const DstOp &Res, const SrcOp &Src); + + /// Build and insert \p Res = G_SHUFFLE_VECTOR \p Src1, \p Src2, \p Mask + /// + /// \pre setBasicBlock or setMI must have been called. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildShuffleVector(const DstOp &Res, const SrcOp &Src1, + const SrcOp &Src2, ArrayRef Mask); + /// Build and insert \p Res = G_CONCAT_VECTORS \p Op0, ... /// /// G_CONCAT_VECTORS creates a vector from the concatenation of 2 or more @@ -1521,6 +1563,13 @@ public: return buildInstr(TargetOpcode::G_FSUB, {Dst}, {Src0, Src1}, Flags); } + /// Build and insert \p Res = G_FDIV \p Op0, \p Op1 + MachineInstrBuilder buildFDiv(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FDIV, {Dst}, {Src0, Src1}, Flags); + } + /// Build and insert \p Res = G_FMA \p Op0, \p Op1, \p Op2 MachineInstrBuilder buildFMA(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, const SrcOp &Src2, @@ -1583,6 +1632,13 @@ public: return buildInstr(TargetOpcode::G_FEXP2, {Dst}, {Src}, Flags); } + /// Build and insert \p Dst = G_FPOW \p Src0, \p Src1 + MachineInstrBuilder buildFPow(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FPOW, {Dst}, {Src0, Src1}, Flags); + } + /// Build and insert \p Res = G_FCOPYSIGN \p Op0, \p Op1 MachineInstrBuilder buildFCopysign(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1) { @@ -1633,6 +1689,11 @@ public: return buildInstr(TargetOpcode::G_UMAX, {Dst}, {Src0, Src1}); } + /// Build and insert \p Dst = G_ABS \p Src + MachineInstrBuilder buildAbs(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_ABS, {Dst}, {Src}); + } + /// Build and insert \p Res = G_JUMP_TABLE \p JTI /// /// G_JUMP_TABLE sets \p Res to the address of the jump table specified by @@ -1641,6 +1702,101 @@ public: /// 
\return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildJumpTable(const LLT PtrTy, unsigned JTI); + /// Build and insert \p Res = G_VECREDUCE_SEQ_FADD \p ScalarIn, \p VecIn + /// + /// \p ScalarIn is the scalar accumulator input to start the sequential + /// reduction operation of \p VecIn. + MachineInstrBuilder buildVecReduceSeqFAdd(const DstOp &Dst, + const SrcOp &ScalarIn, + const SrcOp &VecIn) { + return buildInstr(TargetOpcode::G_VECREDUCE_SEQ_FADD, {Dst}, + {ScalarIn, {VecIn}}); + } + + /// Build and insert \p Res = G_VECREDUCE_SEQ_FMUL \p ScalarIn, \p VecIn + /// + /// \p ScalarIn is the scalar accumulator input to start the sequential + /// reduction operation of \p VecIn. + MachineInstrBuilder buildVecReduceSeqFMul(const DstOp &Dst, + const SrcOp &ScalarIn, + const SrcOp &VecIn) { + return buildInstr(TargetOpcode::G_VECREDUCE_SEQ_FMUL, {Dst}, + {ScalarIn, {VecIn}}); + } + + /// Build and insert \p Res = G_VECREDUCE_FADD \p Src + /// + /// \p ScalarIn is the scalar accumulator input to the reduction operation of + /// \p VecIn. + MachineInstrBuilder buildVecReduceFAdd(const DstOp &Dst, + const SrcOp &ScalarIn, + const SrcOp &VecIn) { + return buildInstr(TargetOpcode::G_VECREDUCE_FADD, {Dst}, {ScalarIn, VecIn}); + } + + /// Build and insert \p Res = G_VECREDUCE_FMUL \p Src + /// + /// \p ScalarIn is the scalar accumulator input to the reduction operation of + /// \p VecIn. 
+ MachineInstrBuilder buildVecReduceFMul(const DstOp &Dst, + const SrcOp &ScalarIn, + const SrcOp &VecIn) { + return buildInstr(TargetOpcode::G_VECREDUCE_FMUL, {Dst}, {ScalarIn, VecIn}); + } + + /// Build and insert \p Res = G_VECREDUCE_FMAX \p Src + MachineInstrBuilder buildVecReduceFMax(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_FMAX, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_FMIN \p Src + MachineInstrBuilder buildVecReduceFMin(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_FMIN, {Dst}, {Src}); + } + /// Build and insert \p Res = G_VECREDUCE_ADD \p Src + MachineInstrBuilder buildVecReduceAdd(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_ADD, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_MUL \p Src + MachineInstrBuilder buildVecReduceMul(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_MUL, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_AND \p Src + MachineInstrBuilder buildVecReduceAnd(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_AND, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_OR \p Src + MachineInstrBuilder buildVecReduceOr(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_OR, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_XOR \p Src + MachineInstrBuilder buildVecReduceXor(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_XOR, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_SMAX \p Src + MachineInstrBuilder buildVecReduceSMax(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_SMAX, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_SMIN \p Src + MachineInstrBuilder buildVecReduceSMin(const DstOp &Dst, const SrcOp &Src) { + return 
buildInstr(TargetOpcode::G_VECREDUCE_SMIN, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_UMAX \p Src + MachineInstrBuilder buildVecReduceUMax(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_UMAX, {Dst}, {Src}); + } + + /// Build and insert \p Res = G_VECREDUCE_UMIN \p Src + MachineInstrBuilder buildVecReduceUMin(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_VECREDUCE_UMIN, {Dst}, {Src}); + } virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, Optional Flags = None); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index 8725d96efd82..da785406bc31 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -104,36 +104,37 @@ public: /// Currently the TableGen-like file would look like: /// \code /// PartialMapping[] = { - /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first vec elt. - /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, - /// /*<2x32-bit> vadd {0, 64, VPR} + /// /*32-bit add*/ {0, 32, GPR}, // Scalar entry repeated for first + /// // vec elt. + /// /*2x32-bit add*/ {0, 32, GPR}, {32, 32, GPR}, + /// /*<2x32-bit> vadd*/ {0, 64, VPR} /// }; // PartialMapping duplicated. 
/// /// ValueMapping[] { - /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, + /// /*plain 32-bit add*/ {&PartialMapping[0], 1}, /// /*expanded vadd on 2xadd*/ {&PartialMapping[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} + /// /*plain <2x32-bit> vadd*/ {&PartialMapping[3], 1} /// }; /// \endcode /// /// With the array of pointer, we would have: /// \code /// PartialMapping[] = { - /// /*32-bit add lower */ {0, 32, GPR}, + /// /*32-bit add lower */ { 0, 32, GPR}, /// /*32-bit add upper */ {32, 32, GPR}, - /// /*<2x32-bit> vadd {0, 64, VPR} + /// /*<2x32-bit> vadd */ { 0, 64, VPR} /// }; // No more duplication. /// /// BreakDowns[] = { - /// /*AddBreakDown*/ &PartialMapping[0], + /// /*AddBreakDown*/ &PartialMapping[0], /// /*2xAddBreakDown*/ &PartialMapping[0], &PartialMapping[1], - /// /*VAddBreakDown*/ &PartialMapping[2] + /// /*VAddBreakDown*/ &PartialMapping[2] /// }; // Addresses of PartialMapping duplicated (smaller). /// /// ValueMapping[] { - /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, + /// /*plain 32-bit add*/ {&BreakDowns[0], 1}, /// /*expanded vadd on 2xadd*/ {&BreakDowns[1], 2}, - /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} + /// /*plain <2x32-bit> vadd*/ {&BreakDowns[3], 1} /// }; /// \endcode /// diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 42d86917721a..68553ab5b1a8 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -18,11 +18,12 @@ #include "llvm/CodeGen/Register.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/LowLevelTypeImpl.h" -#include "llvm/Support/MachineValueType.h" +#include namespace llvm { class AnalysisUsage; +class GISelKnownBits; class MachineFunction; class MachineInstr; class MachineOperand; @@ -33,10 +34,10 @@ class MachineRegisterInfo; class MCInstrDesc; class RegisterBankInfo; class 
TargetInstrInfo; +class TargetLowering; class TargetPassConfig; class TargetRegisterInfo; class TargetRegisterClass; -class Twine; class ConstantFP; class APFloat; @@ -51,9 +52,10 @@ Register constrainRegToClass(MachineRegisterInfo &MRI, /// Constrain the Register operand OpIdx, so that it is now constrained to the /// TargetRegisterClass passed as an argument (RegClass). -/// If this fails, create a new virtual register in the correct class and -/// insert a COPY before \p InsertPt if it is a use or after if it is a -/// definition. The debug location of \p InsertPt is used for the new copy. +/// If this fails, create a new virtual register in the correct class and insert +/// a COPY before \p InsertPt if it is a use or after if it is a definition. +/// In both cases, the function also updates the register of RegMo. The debug +/// location of \p InsertPt is used for the new copy. /// /// \return The virtual register constrained to the right register class. Register constrainOperandRegClass(const MachineFunction &MF, @@ -63,12 +65,13 @@ Register constrainOperandRegClass(const MachineFunction &MF, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, - const MachineOperand &RegMO); + MachineOperand &RegMO); -/// Try to constrain Reg so that it is usable by argument OpIdx of the -/// provided MCInstrDesc \p II. If this fails, create a new virtual -/// register in the correct class and insert a COPY before \p InsertPt -/// if it is a use or after if it is a definition. +/// Try to constrain Reg so that it is usable by argument OpIdx of the provided +/// MCInstrDesc \p II. If this fails, create a new virtual register in the +/// correct class and insert a COPY before \p InsertPt if it is a use or after +/// if it is a definition. In both cases, the function also updates the register +/// of RegMo. /// This is equivalent to constrainOperandRegClass(..., RegClass, ...) /// with RegClass obtained from the MCInstrDesc. 
The debug location of \p /// InsertPt is used for the new copy. @@ -80,7 +83,7 @@ Register constrainOperandRegClass(const MachineFunction &MF, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II, - const MachineOperand &RegMO, unsigned OpIdx); + MachineOperand &RegMO, unsigned OpIdx); /// Mutate the newly-selected instruction \p I to constrain its (possibly /// generic) virtual register operands to the instruction's register class. @@ -121,14 +124,19 @@ void reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R); +/// If \p VReg is defined by a G_CONSTANT, return the corresponding value. +Optional getConstantVRegVal(Register VReg, + const MachineRegisterInfo &MRI); + /// If \p VReg is defined by a G_CONSTANT fits in int64_t /// returns it. -Optional getConstantVRegVal(Register VReg, - const MachineRegisterInfo &MRI); +Optional getConstantVRegSExtVal(Register VReg, + const MachineRegisterInfo &MRI); + /// Simple struct used to hold a constant integer value and a virtual /// register. struct ValueAndVReg { - int64_t Value; + APInt Value; Register VReg; }; /// If \p VReg is defined by a statically evaluable chain of @@ -138,10 +146,13 @@ struct ValueAndVReg { /// When \p LookThroughInstrs == false this function behaves like /// getConstantVRegVal. /// When \p HandleFConstants == false the function bails on G_FCONSTANTs. +/// When \p LookThroughAnyExt == true the function treats G_ANYEXT same as +/// G_SEXT. 
Optional getConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs = true, - bool HandleFConstants = true); + bool HandleFConstants = true, + bool LookThroughAnyExt = false); const ConstantFP* getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI); @@ -151,9 +162,20 @@ const ConstantFP* getConstantFPVRegVal(Register VReg, MachineInstr *getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI); -/// Find the def instruction for \p Reg, folding away any trivial copies. Note -/// it may still return a COPY, if it changes the type. May return nullptr if \p -/// Reg is not a generic virtual register. +/// Simple struct used to hold a Register value and the instruction which +/// defines it. +struct DefinitionAndSourceRegister { + MachineInstr *MI; + Register Reg; +}; + +/// Find the def instruction for \p Reg, and underlying value Register folding +/// away any copies. +Optional +getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI); + +/// Find the def instruction for \p Reg, folding away any trivial copies. May +/// return nullptr if \p Reg is not a generic virtual register. MachineInstr *getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI); @@ -178,6 +200,12 @@ Optional ConstantFoldBinOp(unsigned Opcode, const Register Op1, Optional ConstantFoldExtOp(unsigned Opcode, const Register Op1, uint64_t Imm, const MachineRegisterInfo &MRI); +/// Test if the given value is known to have exactly one bit set. This differs +/// from computeKnownBits in that it doesn't necessarily determine which bit is +/// set. +bool isKnownToBeAPowerOfTwo(Register Val, const MachineRegisterInfo &MRI, + GISelKnownBits *KnownBits = nullptr); + /// Returns true if \p Val can be assumed to never be a NaN. If \p SNaN is true, /// this returns if \p Val can be assumed to never be a signaling NaN. 
bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, @@ -190,17 +218,65 @@ inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI) { Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO); -/// Return the least common multiple type of \p Ty0 and \p Ty1, by changing -/// the number of vector elements or scalar bitwidth. The intent is a -/// G_MERGE_VALUES can be constructed from \p Ty0 elements, and unmerged into -/// \p Ty1. -LLT getLCMType(LLT Ty0, LLT Ty1); +/// Return a virtual register corresponding to the incoming argument register \p +/// PhysReg. This register is expected to have class \p RC, and optional type \p +/// RegTy. This assumes all references to the register will use the same type. +/// +/// If there is an existing live-in argument register, it will be returned. +/// This will also ensure there is a valid copy +Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, + MCRegister PhysReg, + const TargetRegisterClass &RC, + LLT RegTy = LLT()); + +/// Return the least common multiple type of \p OrigTy and \p TargetTy, by changing the +/// number of vector elements or scalar bitwidth. The intent is a +/// G_MERGE_VALUES, G_BUILD_VECTOR, or G_CONCAT_VECTORS can be constructed from +/// \p OrigTy elements, and unmerged into \p TargetTy +LLVM_READNONE +LLT getLCMType(LLT OrigTy, LLT TargetTy); -/// Return a type that is greatest common divisor of \p OrigTy and \p -/// TargetTy. This will either change the number of vector elements, or -/// bitwidth of scalars. The intent is the result type can be used as the -/// result of a G_UNMERGE_VALUES from \p OrigTy. +/// Return a type where the total size is the greatest common divisor of \p +/// OrigTy and \p TargetTy. This will try to either change the number of vector +/// elements, or bitwidth of scalars. 
The intent is the result type can be used +/// as the result of a G_UNMERGE_VALUES from \p OrigTy, and then some +/// combination of G_MERGE_VALUES, G_BUILD_VECTOR and G_CONCAT_VECTORS (possibly +/// with intermediate casts) can re-form \p TargetTy. +/// +/// If these are vectors with different element types, this will try to produce +/// a vector with a compatible total size, but the element type of \p OrigTy. If +/// this can't be satisfied, this will produce a scalar smaller than the +/// original vector elements. +/// +/// In the worst case, this returns LLT::scalar(1) +LLVM_READNONE LLT getGCDType(LLT OrigTy, LLT TargetTy); +/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. +/// If \p MI is not a splat, returns None. +Optional getSplatIndex(MachineInstr &MI); + +/// Returns a scalar constant of a G_BUILD_VECTOR splat if it exists. +Optional getBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + +/// Return true if the specified instruction is a G_BUILD_VECTOR or +/// G_BUILD_VECTOR_TRUNC where all of the elements are 0 or undef. +bool isBuildVectorAllZeros(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + +/// Return true if the specified instruction is a G_BUILD_VECTOR or +/// G_BUILD_VECTOR_TRUNC where all of the elements are ~0 or undef. +bool isBuildVectorAllOnes(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + +/// Returns true if given the TargetLowering's boolean contents information, +/// the value \p Val contains a true value. +bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector, + bool IsFP); + +/// Returns an integer representing true, as defined by the +/// TargetBooleanContents. +int64_t getICmpTrueVal(const TargetLowering &TLI, bool IsVector, bool IsFP); } // End namespace llvm. 
#endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ISDOpcodes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ISDOpcodes.h index 534f988c5e96..1974e2f842c9 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -86,7 +86,16 @@ enum NodeType { /// the parent's frame or return address, and so on. FRAMEADDR, RETURNADDR, + + /// ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic. + /// This node takes no operand, returns a target-specific pointer to the + /// place in the stack frame where the return address of the current + /// function is stored. ADDROFRETURNADDR, + + /// SPONENTRY - Represents the llvm.sponentry intrinsic. Takes no argument + /// and returns the stack pointer value at the entry of the current + /// function calling this intrinsic. SPONENTRY, /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic. @@ -274,6 +283,16 @@ enum NodeType { ADDCARRY, SUBCARRY, + /// Carry-using overflow-aware nodes for multiple precision addition and + /// subtraction. These nodes take three operands: The first two are normal lhs + /// and rhs to the add or sub, and the third is a boolean indicating if there + /// is an incoming carry. They produce two results: the normal result of the + /// add or sub, and a boolean that indicates if an overflow occured (*not* + /// flag, because it may be a store to memory, etc.). If the type of the + /// boolean is not i1 then the high bits conform to getBooleanContents. + SADDO_CARRY, + SSUBO_CARRY, + /// RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition. /// These nodes take two operands: the normal LHS and RHS to the add. They /// produce two results: the normal result of the add, and a boolean that @@ -310,6 +329,16 @@ enum NodeType { SSUBSAT, USUBSAT, + /// RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift. 
The first + /// operand is the value to be shifted, and the second argument is the amount + /// to shift by. Both must be integers of the same bit width (W). If the true + /// value of LHS << RHS exceeds the largest value that can be represented by + /// W bits, the resulting value is this maximum value, Otherwise, if this + /// value is less than the smallest value that can be represented by W bits, + /// the resulting value is this minimum value. + SSHLSAT, + USHLSAT, + /// RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication /// on /// 2 integers with the same width and scale. SCALE represents the scale of @@ -504,7 +533,8 @@ enum NodeType { /// IDX is first scaled by the runtime scaling factor of T. Elements IDX /// through (IDX + num_elements(T) - 1) must be valid VECTOR indices. If this /// condition cannot be determined statically but is false at runtime, then - /// the result vector is undefined. + /// the result vector is undefined. The IDX parameter must be a vector index + /// constant type, which for most targets will be an integer pointer type. /// /// This operation supports extracting a fixed-width vector from a scalable /// vector, but not the other way around. @@ -587,6 +617,7 @@ enum NodeType { CTLZ, CTPOP, BITREVERSE, + PARITY, /// Bit counting operators with an undefined result for zero inputs. CTTZ_ZERO_UNDEF, @@ -703,6 +734,21 @@ enum NodeType { FP_TO_SINT, FP_TO_UINT, + /// FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a + /// signed or unsigned integer type with the bit width given in operand 1 with + /// the following semantics: + /// + /// * If the value is NaN, zero is returned. + /// * If the value is larger/smaller than the largest/smallest integer, + /// the largest/smallest integer is returned (saturation). + /// * Otherwise the result of rounding the value towards zero is returned. + /// + /// The width given in operand 1 must be equal to, or smaller than, the scalar + /// result type width. 
It may end up being smaller than the result witdh as a + /// result of integer type legalization. + FP_TO_SINT_SAT, + FP_TO_UINT_SAT, + /// X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type /// down to the precision of the destination VT. TRUNC is a flag, which is /// always an integer that is zero or one. If TRUNC is 0, this is a @@ -844,13 +890,18 @@ enum NodeType { /// BRCOND - Conditional branch. The first operand is the chain, the /// second is the condition, the third is the block to branch to if the /// condition is true. If the type of the condition is not i1, then the - /// high bits must conform to getBooleanContents. + /// high bits must conform to getBooleanContents. If the condition is undef, + /// it nondeterministically jumps to the block. + /// TODO: Its semantics w.r.t undef requires further discussion; we need to + /// make it sure that it is consistent with optimizations in MIR & the + /// meaning of IMPLICIT_DEF. See https://reviews.llvm.org/D92015 BRCOND, /// BR_CC - Conditional branch. The behavior is like that of SELECT_CC, in /// that the condition is represented as condition code, and two nodes to /// compare, rather than as a combined SetCC node. The operands in order - /// are chain, cc, lhs, rhs, block to branch to if condition is true. + /// are chain, cc, lhs, rhs, block to branch to if condition is true. If + /// condition is undef, it nondeterministically jumps to the block. BR_CC, /// INLINEASM - Represents an inline asm block. This node always has two @@ -981,6 +1032,9 @@ enum NodeType { /// DEBUGTRAP - Trap intended to get the attention of a debugger. DEBUGTRAP, + /// UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure. + UBSANTRAP, + /// PREFETCH - This corresponds to a prefetch intrinsic. The first operand /// is the chain. 
The other operands are the address to prefetch, /// read / write specifier, locality specifier and instruction / data cache @@ -1075,6 +1129,10 @@ enum NodeType { /// known nonzero constant. The only operand here is the chain. GET_DYNAMIC_AREA_OFFSET, + /// Pseudo probe for AutoFDO, as a place holder in a basic block to improve + /// the sample counts quality. + PSEUDO_PROBE, + /// VSCALE(IMM) - Returns the runtime scaling factor used to calculate the /// number of elements within a scalable vector. IMM is a constant integer /// multiplier that is applied to the runtime value. @@ -1082,12 +1140,25 @@ enum NodeType { /// Generic reduction nodes. These nodes represent horizontal vector /// reduction operations, producing a scalar result. - /// The STRICT variants perform reductions in sequential order. The first + /// The SEQ variants perform reductions in sequential order. The first /// operand is an initial scalar accumulator value, and the second operand /// is the vector to reduce. - VECREDUCE_STRICT_FADD, - VECREDUCE_STRICT_FMUL, - /// These reductions are non-strict, and have a single vector operand. + /// E.g. RES = VECREDUCE_SEQ_FADD f32 ACC, <4 x f32> SRC_VEC + /// ... is equivalent to + /// RES = (((ACC + SRC_VEC[0]) + SRC_VEC[1]) + SRC_VEC[2]) + SRC_VEC[3] + VECREDUCE_SEQ_FADD, + VECREDUCE_SEQ_FMUL, + + /// These reductions have relaxed evaluation order semantics, and have a + /// single vector operand. The order of evaluation is unspecified. For + /// pow-of-2 vectors, one valid legalizer expansion is to use a tree + /// reduction, i.e.: + /// For RES = VECREDUCE_FADD <8 x f16> SRC_VEC + /// PART_RDX = FADD SRC_VEC[0:3], SRC_VEC[4:7] + /// PART_RDX2 = FADD PART_RDX[0:1], PART_RDX[2:3] + /// RES = FADD PART_RDX2[0], PART_RDX2[1] + /// For non-pow-2 vectors, this can be computed by extracting each element + /// and performing the operation as if it were scalarized. VECREDUCE_FADD, VECREDUCE_FMUL, /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. 
@@ -1106,6 +1177,10 @@ enum NodeType { VECREDUCE_UMAX, VECREDUCE_UMIN, +// Vector Predication +#define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID, +#include "llvm/IR/VPIntrinsics.def" + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END @@ -1122,6 +1197,19 @@ static const int FIRST_TARGET_STRICTFP_OPCODE = BUILTIN_OP_END + 400; /// be used with SelectionDAG::getMemIntrinsicNode. static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END + 500; +/// Get underlying scalar opcode for VECREDUCE opcode. +/// For example ISD::AND for ISD::VECREDUCE_AND. +NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode); + +/// Whether this is a vector-predicated Opcode. +bool isVPOpcode(unsigned Opcode); + +/// The operand position of the vector mask. +Optional getVPMaskIdx(unsigned Opcode); + +/// The operand position of the explicit vector length parameter. +Optional getVPExplicitVectorLengthIdx(unsigned Opcode); + //===--------------------------------------------------------------------===// /// MemIndexedMode enum - This enum defines the load / store indexed /// addressing modes. @@ -1244,6 +1332,12 @@ inline bool isUnsignedIntSetCC(CondCode Code) { return Code == SETUGT || Code == SETUGE || Code == SETULT || Code == SETULE; } +/// Return true if this is a setcc instruction that performs an equality +/// comparison when used with integer operands. +inline bool isIntEqualitySetCC(CondCode Code) { + return Code == SETEQ || Code == SETNE; +} + /// Return true if the specified condition returns true if the two operands to /// the condition are equal. Note that if one of the two operands is a NaN, /// this value is meaningless. 
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LexicalScopes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LexicalScopes.h index bac850d327ef..9617ba80c138 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LexicalScopes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LexicalScopes.h @@ -194,9 +194,6 @@ public: return I != LexicalScopeMap.end() ? &I->second : nullptr; } - /// dump - Print data structures to dbgs(). - void dump() const; - /// getOrCreateAbstractScope - Find or create an abstract lexical scope. LexicalScope *getOrCreateAbstractScope(const DILocalScope *Scope); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h index 0764257125e6..c2b158ac1b7f 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveInterval.h @@ -25,6 +25,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Allocator.h" @@ -597,10 +598,9 @@ namespace llvm { /// @p End. bool isUndefIn(ArrayRef Undefs, SlotIndex Begin, SlotIndex End) const { - return std::any_of(Undefs.begin(), Undefs.end(), - [Begin,End] (SlotIndex Idx) -> bool { - return Begin <= Idx && Idx < End; - }); + return llvm::any_of(Undefs, [Begin, End](SlotIndex Idx) -> bool { + return Begin <= Idx && Idx < End; + }); } /// Flush segment set into the regular segment vector. @@ -704,12 +704,16 @@ namespace llvm { private: SubRange *SubRanges = nullptr; ///< Single linked list of subregister live /// ranges. + const Register Reg; // the register or stack slot of this interval. + float Weight = 0.0; // weight of this interval public: - const unsigned reg; // the register or stack slot of this interval. 
- float weight; // weight of this interval + Register reg() const { return Reg; } + float weight() const { return Weight; } + void incrementWeight(float Inc) { Weight += Inc; } + void setWeight(float Value) { Weight = Value; } - LiveInterval(unsigned Reg, float Weight) : reg(Reg), weight(Weight) {} + LiveInterval(unsigned Reg, float Weight) : Reg(Reg), Weight(Weight) {} ~LiveInterval() { clearSubRanges(); @@ -731,10 +735,10 @@ namespace llvm { ++*this; return res; } - bool operator!=(const SingleLinkedListIterator &Other) { + bool operator!=(const SingleLinkedListIterator &Other) const { return P != Other.operator->(); } - bool operator==(const SingleLinkedListIterator &Other) { + bool operator==(const SingleLinkedListIterator &Other) const { return P == Other.operator->(); } T &operator*() const { @@ -806,14 +810,10 @@ namespace llvm { unsigned getSize() const; /// isSpillable - Can this interval be spilled? - bool isSpillable() const { - return weight != huge_valf; - } + bool isSpillable() const { return Weight != huge_valf; } /// markNotSpillable - Mark interval as not spillable - void markNotSpillable() { - weight = huge_valf; - } + void markNotSpillable() { Weight = huge_valf; } /// For a given lane mask @p LaneMask, compute indexes at which the /// lane is marked undefined by subregister definitions. @@ -834,7 +834,7 @@ namespace llvm { /// function will be applied to the L0010 and L0008 subranges. /// /// \p Indexes and \p TRI are required to clean up the VNIs that - /// don't defne the related lane masks after they get shrunk. E.g., + /// don't define the related lane masks after they get shrunk. E.g., /// when L000F gets split into L0007 and L0008 maybe only a subset /// of the VNIs that defined L000F defines L0007. 
/// @@ -870,7 +870,7 @@ namespace llvm { bool operator<(const LiveInterval& other) const { const SlotIndex &thisIndex = beginIndex(); const SlotIndex &otherIndex = other.beginIndex(); - return std::tie(thisIndex, reg) < std::tie(otherIndex, other.reg); + return std::tie(thisIndex, Reg) < std::tie(otherIndex, other.Reg); } void print(raw_ostream &OS) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervalUnion.h index c555763a4ec2..ad9e06d2bcf0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervalUnion.h @@ -104,6 +104,9 @@ public: void verify(LiveVirtRegBitSet& VisitedVRegs); #endif + // Get any virtual register that is assign to this physical unit + LiveInterval *getOneVReg() const; + /// Query interferences between a single live virtual register and a live /// interval union. class Query { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervals.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervals.h index 945a40829714..fa08166791b0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -114,8 +114,8 @@ class VirtRegMap; LiveInterval &getInterval(Register Reg) { if (hasInterval(Reg)) return *VirtRegIntervals[Reg.id()]; - else - return createAndComputeVirtRegInterval(Reg); + + return createAndComputeVirtRegInterval(Reg); } const LiveInterval &getInterval(Register Reg) const { @@ -142,14 +142,14 @@ class VirtRegMap; } /// Interval removal. - void removeInterval(unsigned Reg) { + void removeInterval(Register Reg) { delete VirtRegIntervals[Reg]; VirtRegIntervals[Reg] = nullptr; } /// Given a register and an instruction, adds a live segment from that /// instruction to the end of its MBB. 
- LiveInterval::Segment addSegmentToEndOfBlock(unsigned reg, + LiveInterval::Segment addSegmentToEndOfBlock(Register Reg, MachineInstr &startInst); /// After removing some uses of a register, shrink its live range to just @@ -167,7 +167,7 @@ class VirtRegMap; /// the lane mask of the subregister range. /// This may leave the subrange empty which needs to be cleaned up with /// LiveInterval::removeEmptySubranges() afterwards. - void shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg); + void shrinkToUses(LiveInterval::SubRange &SR, Register Reg); /// Extend the live range \p LR to reach all points in \p Indices. The /// points in the \p Indices array must be jointly dominated by the union @@ -256,9 +256,8 @@ class VirtRegMap; return Indexes->getMBBFromIndex(index); } - void insertMBBInMaps(MachineBasicBlock *MBB, - MachineInstr *InsertionPoint = nullptr) { - Indexes->insertMBBInMaps(MBB, InsertionPoint); + void insertMBBInMaps(MachineBasicBlock *MBB) { + Indexes->insertMBBInMaps(MBB); assert(unsigned(MBB->getNumber()) == RegMaskBlocks.size() && "Blocks must be added in order."); RegMaskBlocks.push_back(std::make_pair(RegMaskSlots.size(), 0)); @@ -423,7 +422,7 @@ class VirtRegMap; /// Reg. Subsequent uses should rely on on-demand recomputation. \note This /// method can result in inconsistent liveness tracking if multiple phyical /// registers share a regunit, and should be used cautiously. - void removeAllRegUnitsForPhysReg(unsigned Reg) { + void removeAllRegUnitsForPhysReg(MCRegister Reg) { for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) removeRegUnit(*Units); } @@ -431,7 +430,7 @@ class VirtRegMap; /// Remove value numbers and related live segments starting at position /// \p Pos that are part of any liverange of physical register \p Reg or one /// of its subregisters. 
- void removePhysRegDefAt(unsigned Reg, SlotIndex Pos); + void removePhysRegDefAt(MCRegister Reg, SlotIndex Pos); /// Remove value number and related live segments of \p LI and its subranges /// that start at position \p Pos. @@ -463,7 +462,7 @@ class VirtRegMap; bool computeDeadValues(LiveInterval &LI, SmallVectorImpl *dead); - static LiveInterval* createInterval(unsigned Reg); + static LiveInterval *createInterval(Register Reg); void printInstrs(raw_ostream &O) const; void dumpInstrs() const; @@ -474,7 +473,7 @@ class VirtRegMap; using ShrinkToUsesWorkList = SmallVector, 16>; void extendSegmentsToUses(LiveRange &Segments, - ShrinkToUsesWorkList &WorkList, unsigned Reg, + ShrinkToUsesWorkList &WorkList, Register Reg, LaneBitmask LaneMask); /// Helper function for repairIntervalsInRange(), walks backwards and @@ -484,7 +483,7 @@ class VirtRegMap; void repairOldRegInRange(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, const SlotIndex endIdx, LiveRange &LR, - unsigned Reg, + Register Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); class HMEditor; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRangeEdit.h index 3c4273130ab2..87d48adc7f27 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -56,14 +56,14 @@ public: /// Called when a virtual register is no longer used. Return false to defer /// its deletion from LiveIntervals. - virtual bool LRE_CanEraseVirtReg(unsigned) { return true; } + virtual bool LRE_CanEraseVirtReg(Register) { return true; } /// Called before shrinking the live range of a virtual register. - virtual void LRE_WillShrinkVirtReg(unsigned) {} + virtual void LRE_WillShrinkVirtReg(Register) {} /// Called after cloning a virtual register. /// This is used for new registers representing connected components of Old. 
- virtual void LRE_DidCloneVirtReg(unsigned New, unsigned Old) {} + virtual void LRE_DidCloneVirtReg(Register New, Register Old) {} }; private: @@ -152,7 +152,7 @@ public: return *Parent; } - Register getReg() const { return getParent().reg; } + Register getReg() const { return getParent().reg(); } /// Iterator for accessing the new registers added by this edit. using iterator = SmallVectorImpl::const_iterator; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegMatrix.h index ab4d44f9a611..fc67bce329ab 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -104,19 +104,19 @@ public: /// If this function returns IK_Free, it is legal to assign(VirtReg, PhysReg). /// When there is more than one kind of interference, the InterferenceKind /// with the highest enum value is returned. - InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg); + InterferenceKind checkInterference(LiveInterval &VirtReg, MCRegister PhysReg); /// Check for interference in the segment [Start, End) that may prevent /// assignment to PhysReg. If this function returns true, there is /// interference in the segment [Start, End) of some other interval already /// assigned to PhysReg. If this function returns false, PhysReg is free at /// the segment [Start, End). - bool checkInterference(SlotIndex Start, SlotIndex End, unsigned PhysReg); + bool checkInterference(SlotIndex Start, SlotIndex End, MCRegister PhysReg); /// Assign VirtReg to PhysReg. /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and /// update VirtRegMap. The live range is expected to be available in PhysReg. - void assign(LiveInterval &VirtReg, unsigned PhysReg); + void assign(LiveInterval &VirtReg, MCRegister PhysReg); /// Unassign VirtReg from its PhysReg. 
/// Assuming that VirtReg was previously assigned to a PhysReg, this undoes @@ -124,7 +124,7 @@ public: void unassign(LiveInterval &VirtReg); /// Returns true if the given \p PhysReg has any live intervals assigned. - bool isPhysRegUsed(unsigned PhysReg) const; + bool isPhysRegUsed(MCRegister PhysReg) const; //===--------------------------------------------------------------------===// // Low-level interface. @@ -136,22 +136,25 @@ public: /// Check for regmask interference only. /// Return true if VirtReg crosses a regmask operand that clobbers PhysReg. /// If PhysReg is null, check if VirtReg crosses any regmask operands. - bool checkRegMaskInterference(LiveInterval &VirtReg, unsigned PhysReg = 0); + bool checkRegMaskInterference(LiveInterval &VirtReg, + MCRegister PhysReg = MCRegister::NoRegister); /// Check for regunit interference only. /// Return true if VirtReg overlaps a fixed assignment of one of PhysRegs's /// register units. - bool checkRegUnitInterference(LiveInterval &VirtReg, unsigned PhysReg); + bool checkRegUnitInterference(LiveInterval &VirtReg, MCRegister PhysReg); /// Query a line of the assigned virtual register matrix directly. /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg. /// This returns a reference to an internal Query data structure that is only /// valid until the next query() call. - LiveIntervalUnion::Query &query(const LiveRange &LR, unsigned RegUnit); + LiveIntervalUnion::Query &query(const LiveRange &LR, MCRegister RegUnit); /// Directly access the live interval unions per regunit. /// This returns an array indexed by the regunit number. 
LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; } + + Register getOneVReg(unsigned PhysReg) const; }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegUnits.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegUnits.h index 1ed091e3bb5e..39a1ec461ef6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegUnits.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveRegUnits.h @@ -15,7 +15,7 @@ #define LLVM_CODEGEN_LIVEREGUNITS_H #include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" @@ -67,7 +67,6 @@ public: UsedRegUnits.addReg(Reg); } } - return; } /// Initialize and clear the set. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveVariables.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveVariables.h index efb0fa85a0fe..9b0667bbbeb0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveVariables.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LiveVariables.h @@ -105,8 +105,7 @@ public: /// isLiveIn - Is Reg live in to MBB? This means that Reg is live through /// MBB, or it is killed in MBB. If Reg is only used by PHI instructions in /// MBB, it is not considered live in. - bool isLiveIn(const MachineBasicBlock &MBB, - unsigned Reg, + bool isLiveIn(const MachineBasicBlock &MBB, Register Reg, MachineRegisterInfo &MRI); void dump() const; @@ -149,25 +148,25 @@ private: // Intermediate data structures /// HandlePhysRegKill - Add kills of Reg and its sub-registers to the /// uses. Pay special attention to the sub-register uses which may come below /// the last use of the whole register. - bool HandlePhysRegKill(unsigned Reg, MachineInstr *MI); + bool HandlePhysRegKill(Register Reg, MachineInstr *MI); /// HandleRegMask - Call HandlePhysRegKill for all registers clobbered by Mask. 
void HandleRegMask(const MachineOperand&); - void HandlePhysRegUse(unsigned Reg, MachineInstr &MI); - void HandlePhysRegDef(unsigned Reg, MachineInstr *MI, + void HandlePhysRegUse(Register Reg, MachineInstr &MI); + void HandlePhysRegDef(Register Reg, MachineInstr *MI, SmallVectorImpl &Defs); void UpdatePhysRegDefs(MachineInstr &MI, SmallVectorImpl &Defs); /// FindLastRefOrPartRef - Return the last reference or partial reference of /// the specified register. - MachineInstr *FindLastRefOrPartRef(unsigned Reg); + MachineInstr *FindLastRefOrPartRef(Register Reg); /// FindLastPartialDef - Return the last partial def of the specified /// register. Also returns the sub-registers that're defined by the /// instruction. - MachineInstr *FindLastPartialDef(unsigned Reg, - SmallSet &PartDefRegs); + MachineInstr *FindLastPartialDef(Register Reg, + SmallSet &PartDefRegs); /// analyzePHINodes - Gather information about the PHI nodes in here. In /// particular, we want to map the variable information of a virtual @@ -184,21 +183,21 @@ public: /// RegisterDefIsDead - Return true if the specified instruction defines the /// specified register, but that definition is dead. - bool RegisterDefIsDead(MachineInstr &MI, unsigned Reg) const; + bool RegisterDefIsDead(MachineInstr &MI, Register Reg) const; //===--------------------------------------------------------------------===// // API to update live variable information /// replaceKillInstruction - Update register kill info by replacing a kill /// instruction with a new one. - void replaceKillInstruction(unsigned Reg, MachineInstr &OldMI, + void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI); /// addVirtualRegisterKilled - Add information about the fact that the /// specified register is killed after being used by the specified /// instruction. If AddIfNotFound is true, add a implicit operand if it's /// not found. 
- void addVirtualRegisterKilled(unsigned IncomingReg, MachineInstr &MI, + void addVirtualRegisterKilled(Register IncomingReg, MachineInstr &MI, bool AddIfNotFound = false) { if (MI.addRegisterKilled(IncomingReg, TRI, AddIfNotFound)) getVarInfo(IncomingReg).Kills.push_back(&MI); @@ -208,14 +207,14 @@ public: /// register from the live variable information. Returns true if the /// variable was marked as killed by the specified instruction, /// false otherwise. - bool removeVirtualRegisterKilled(unsigned reg, MachineInstr &MI) { - if (!getVarInfo(reg).removeKill(MI)) + bool removeVirtualRegisterKilled(Register Reg, MachineInstr &MI) { + if (!getVarInfo(Reg).removeKill(MI)) return false; bool Removed = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isKill() && MO.getReg() == reg) { + if (MO.isReg() && MO.isKill() && MO.getReg() == Reg) { MO.setIsKill(false); Removed = true; break; @@ -234,7 +233,7 @@ public: /// addVirtualRegisterDead - Add information about the fact that the specified /// register is dead after being used by the specified instruction. If /// AddIfNotFound is true, add a implicit operand if it's not found. - void addVirtualRegisterDead(unsigned IncomingReg, MachineInstr &MI, + void addVirtualRegisterDead(Register IncomingReg, MachineInstr &MI, bool AddIfNotFound = false) { if (MI.addRegisterDead(IncomingReg, TRI, AddIfNotFound)) getVarInfo(IncomingReg).Kills.push_back(&MI); @@ -244,14 +243,14 @@ public: /// register from the live variable information. Returns true if the /// variable was marked dead at the specified instruction, false /// otherwise. 
- bool removeVirtualRegisterDead(unsigned reg, MachineInstr &MI) { - if (!getVarInfo(reg).removeKill(MI)) + bool removeVirtualRegisterDead(Register Reg, MachineInstr &MI) { + if (!getVarInfo(Reg).removeKill(MI)) return false; bool Removed = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isDef() && MO.getReg() == reg) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) { MO.setIsDead(false); Removed = true; break; @@ -270,24 +269,25 @@ public: /// getVarInfo - Return the VarInfo structure for the specified VIRTUAL /// register. - VarInfo &getVarInfo(unsigned RegIdx); + VarInfo &getVarInfo(Register Reg); void MarkVirtRegAliveInBlock(VarInfo& VRInfo, MachineBasicBlock* DefBlock, MachineBasicBlock *BB); - void MarkVirtRegAliveInBlock(VarInfo& VRInfo, MachineBasicBlock* DefBlock, + void MarkVirtRegAliveInBlock(VarInfo &VRInfo, MachineBasicBlock *DefBlock, MachineBasicBlock *BB, - std::vector &WorkList); - void HandleVirtRegDef(unsigned reg, MachineInstr &MI); - void HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB, MachineInstr &MI); + SmallVectorImpl &WorkList); + + void HandleVirtRegDef(Register reg, MachineInstr &MI); + void HandleVirtRegUse(Register reg, MachineBasicBlock *MBB, MachineInstr &MI); - bool isLiveIn(unsigned Reg, const MachineBasicBlock &MBB) { + bool isLiveIn(Register Reg, const MachineBasicBlock &MBB) { return getVarInfo(Reg).isLiveIn(MBB, Reg, *MRI); } /// isLiveOut - Determine if Reg is live out from MBB, when not considering /// PHI nodes. This means that Reg is either killed by a successor block or /// passed through one. - bool isLiveOut(unsigned Reg, const MachineBasicBlock &MBB); + bool isLiveOut(Register Reg, const MachineBasicBlock &MBB); /// addNewBlock - Add a new basic block BB between DomBB and SuccBB. 
All /// variables that are live out of DomBB and live into SuccBB will be marked @@ -303,10 +303,10 @@ public: std::vector> &LiveInSets); /// isPHIJoin - Return true if Reg is a phi join register. - bool isPHIJoin(unsigned Reg) { return PHIJoins.test(Reg); } + bool isPHIJoin(Register Reg) { return PHIJoins.test(Reg.id()); } /// setPHIJoin - Mark Reg as a phi join register. - void setPHIJoin(unsigned Reg) { PHIJoins.set(Reg); } + void setPHIJoin(Register Reg) { PHIJoins.set(Reg.id()); } }; } // End llvm namespace diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LowLevelType.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LowLevelType.h index 6295d86f749c..402fa2ce61e7 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LowLevelType.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LowLevelType.h @@ -23,6 +23,7 @@ namespace llvm { class DataLayout; class Type; +struct fltSemantics; /// Construct a low-level type based on an LLVM type. LLT getLLTForType(Type &Ty, const DataLayout &DL); @@ -35,6 +36,9 @@ MVT getMVTForLLT(LLT Ty); /// scalarable vector types, and will assert if used. LLT getLLTForMVT(MVT Ty); +/// Get the appropriate floating point arithmetic semantic based on the bit size +/// of the given scalar LLT. 
+const llvm::fltSemantics &getFltSemanticForLLT(LLT Ty); } #endif // LLVM_CODEGEN_LOWLEVELTYPE_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MBFIWrapper.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MBFIWrapper.h index 062431a6f96b..bcbf3eedf59d 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MBFIWrapper.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MBFIWrapper.h @@ -28,6 +28,8 @@ class MBFIWrapper { BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const; void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F); + Optional getBlockProfileCount(const MachineBasicBlock *MBB) const; + raw_ostream &printBlockFreq(raw_ostream &OS, const MachineBasicBlock *MBB) const; raw_ostream &printBlockFreq(raw_ostream &OS, diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRFormatter.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRFormatter.h index e57c32c5ae61..9cb92091db50 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRFormatter.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRFormatter.h @@ -14,11 +14,15 @@ #ifndef LLVM_CODEGEN_MIRFORMATTER_H #define LLVM_CODEGEN_MIRFORMATTER_H -#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/ADT/Optional.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/Support/raw_ostream.h" +#include namespace llvm { +class MachineFunction; +class MachineInstr; struct PerFunctionMIParsingState; struct SlotMapping; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRYamlMapping.h index c68b073ebb8c..4a7406473b11 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -159,6 +159,22 @@ template <> struct ScalarTraits { static QuotingType mustQuote(StringRef) { return QuotingType::None; } }; +template <> struct ScalarTraits { + static void output(const Align &Alignment, void 
*, llvm::raw_ostream &OS) { + OS << Alignment.value(); + } + static StringRef input(StringRef Scalar, void *, Align &Alignment) { + unsigned long long N; + if (getAsUnsignedInteger(Scalar, 10, N)) + return "invalid number"; + if (!isPowerOf2_64(N)) + return "must be a power of two"; + Alignment = Align(N); + return StringRef(); + } + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + } // end namespace yaml } // end namespace llvm @@ -331,7 +347,7 @@ struct ScalarEnumerationTraits { static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) { IO.enumCase(ID, "default", TargetStackID::Default); IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill); - IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector); + IO.enumCase(ID, "scalable-vector", TargetStackID::ScalableVector); IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc); } }; @@ -425,6 +441,36 @@ template <> struct MappingTraits { static const bool flow = true; }; +/// Serializable representation of debug value substitutions. +struct DebugValueSubstitution { + unsigned SrcInst; + unsigned SrcOp; + unsigned DstInst; + unsigned DstOp; + + bool operator==(const DebugValueSubstitution &Other) const { + return std::tie(SrcInst, SrcOp, DstInst, DstOp) == + std::tie(Other.SrcInst, Other.SrcOp, Other.DstInst, Other.DstOp); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, DebugValueSubstitution &Sub) { + YamlIO.mapRequired("srcinst", Sub.SrcInst); + YamlIO.mapRequired("srcop", Sub.SrcOp); + YamlIO.mapRequired("dstinst", Sub.DstInst); + YamlIO.mapRequired("dstop", Sub.DstOp); + } + + static const bool flow = true; +}; +} // namespace yaml +} // namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::DebugValueSubstitution) + +namespace llvm { +namespace yaml { struct MachineConstantPoolValue { UnsignedValue ID; StringValue Value; @@ -609,6 +655,7 @@ struct MachineFunction { std::vector Constants; /// Constant pool. 
std::unique_ptr MachineFuncInfo; std::vector CallSitesInfo; + std::vector DebugValueSubstitutions; MachineJumpTable JumpTableInfo; BlockStringValue Body; }; @@ -637,6 +684,8 @@ template <> struct MappingTraits { std::vector()); YamlIO.mapOptional("callSites", MF.CallSitesInfo, std::vector()); + YamlIO.mapOptional("debugValueSubstitutions", MF.DebugValueSubstitutions, + std::vector()); YamlIO.mapOptional("constants", MF.Constants, std::vector()); YamlIO.mapOptional("machineFunctionInfo", MF.MachineFuncInfo); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h index d6cb7211cf70..2bad64c6cc2e 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -40,6 +40,7 @@ class Printable; class SlotIndexes; class StringRef; class raw_ostream; +class LiveIntervals; class TargetRegisterClass; class TargetRegisterInfo; @@ -174,8 +175,9 @@ private: /// is only computed once and is cached. mutable MCSymbol *CachedMCSymbol = nullptr; - /// Used during basic block sections to mark the end of a basic block. - MCSymbol *EndMCSymbol = nullptr; + /// Marks the end of the basic block. Used during basic block sections to + /// calculate the size of the basic block, or the BB section ending with it. + mutable MCSymbol *CachedEndMCSymbol = nullptr; // Intrusive list support MachineBasicBlock() = default; @@ -432,6 +434,9 @@ public: bool hasEHPadSuccessor() const; + /// Returns true if this is the entry block of the function. + bool isEntryBlock() const; + /// Returns true if this is the entry block of an EH scope, i.e., the block /// that used to have a catchpad or cleanuppad instruction in the LLVM IR. bool isEHScopeEntry() const { return IsEHScopeEntry; } @@ -474,6 +479,9 @@ public: /// Sets the section ID for this basic block. 
void setSectionID(MBBSectionID V) { SectionID = V; } + /// Returns the MCSymbol marking the end of this basic block. + MCSymbol *getEndSymbol() const; + /// Returns true if this block may have an INLINEASM_BR (overestimate, by /// checking if any of the successors are indirect targets of any inlineasm_br /// in the function). @@ -671,6 +679,17 @@ public: return !empty() && back().isEHScopeReturn(); } + /// Split a basic block into 2 pieces at \p SplitPoint. A new block will be + /// inserted after this block, and all instructions after \p SplitInst moved + /// to it (\p SplitInst will be in the original block). If \p LIS is provided, + /// LiveIntervals will be appropriately updated. \return the newly inserted + /// block. + /// + /// If \p UpdateLiveIns is true, this will ensure the live ins list is + /// accurate, including for physreg uses/defs in the original block. + MachineBasicBlock *splitAt(MachineInstr &SplitInst, bool UpdateLiveIns = true, + LiveIntervals *LIS = nullptr); + /// Split the critical edge from this block to the given successor block, and /// return the newly created block, or null if splitting is not possible. /// @@ -872,6 +891,14 @@ public: void print(raw_ostream &OS, ModuleSlotTracker &MST, const SlotIndexes * = nullptr, bool IsStandalone = true) const; + enum PrintNameFlag { + PrintNameIr = (1 << 0), ///< Add IR name where available + PrintNameAttributes = (1 << 1), ///< Print attributes + }; + + void printName(raw_ostream &os, unsigned printNameFlags = PrintNameIr, + ModuleSlotTracker *moduleSlotTracker = nullptr) const; + // Printing method used by LoopInfo. 
void printAsOperand(raw_ostream &OS, bool PrintType = true) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index 0f8d69ebd7da..6c442d3d07bd 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -58,18 +58,33 @@ public: /// information. Please note that initial frequency is equal to 1024. It means /// that we should not rely on the value itself, but only on the comparison to /// the other block frequencies. We do this to avoid using of floating points. - /// + /// For example, to get the frequency of a block relative to the entry block, + /// divide the integral value returned by this function (the + /// BlockFrequency::getFrequency() value) by getEntryFreq(). BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const; + /// Compute the frequency of the block, relative to the entry block. + /// This API assumes getEntryFreq() is non-zero. + float getBlockFreqRelativeToEntryBlock(const MachineBasicBlock *MBB) const { + return getBlockFreq(MBB).getFrequency() * (1.0f / getEntryFreq()); + } + Optional getBlockProfileCount(const MachineBasicBlock *MBB) const; Optional getProfileCountFromFreq(uint64_t Freq) const; - bool isIrrLoopHeader(const MachineBasicBlock *MBB); + bool isIrrLoopHeader(const MachineBasicBlock *MBB) const; - void setBlockFreq(const MachineBasicBlock *MBB, uint64_t Freq); + /// incrementally calculate block frequencies when we split edges, to avoid + /// full CFG traversal. 
+ void onEdgeSplit(const MachineBasicBlock &NewPredecessor, + const MachineBasicBlock &NewSuccessor, + const MachineBranchProbabilityInfo &MBPI); const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; + + /// Pop up a ghostview window with the current block frequency propagation + /// rendered using dot. void view(const Twine &Name, bool isSimple = true) const; // Print the block frequency Freq to OS using the current functions entry @@ -81,6 +96,8 @@ public: raw_ostream &printBlockFreq(raw_ostream &OS, const MachineBasicBlock *MBB) const; + /// Divide a block's BlockFrequency::getFrequency() value by this value to + /// obtain the entry block - relative frequency of said block. uint64_t getEntryFreq() const; }; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineCombinerPattern.h index e9f52fb064e1..ac0cc70744d1 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineCombinerPattern.h @@ -29,6 +29,11 @@ enum class MachineCombinerPattern { REASSOC_XY_AMM_BMM, REASSOC_XMM_AMM_BMM, + // These are patterns matched by the PowerPC to reassociate FMA and FSUB to + // reduce register pressure. + REASSOC_XY_BCA, + REASSOC_XY_BAC, + // These are multiply-add patterns matched by the AArch64 machine combiner. MULADDW_OP1, MULADDW_OP2, diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineConstantPool.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineConstantPool.h index cfc9ca88c976..a9bc0ce300b2 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineConstantPool.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineConstantPool.h @@ -41,10 +41,10 @@ public: explicit MachineConstantPoolValue(Type *ty) : Ty(ty) {} virtual ~MachineConstantPoolValue() = default; - /// getType - get type of this MachineConstantPoolValue. 
- /// Type *getType() const { return Ty; } + virtual unsigned getSizeInBytes(const DataLayout &DL) const; + virtual int getExistingMachineCPValue(MachineConstantPool *CP, Align Alignment) = 0; @@ -94,7 +94,7 @@ public: Align getAlign() const { return Alignment; } - Type *getType() const; + unsigned getSizeInBytes(const DataLayout &DL) const; /// This method classifies the entry according to whether or not it may /// generate a relocation entry. This must be conservative, so if it might diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h index f7bbd07a63ab..e3e679608784 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -14,7 +14,6 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/GenericDomTree.h" -#include namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominators.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominators.h index cf3af4d38223..46bf73cdd7b6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominators.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineDominators.h @@ -23,7 +23,6 @@ #include "llvm/Support/GenericDomTreeConstruction.h" #include #include -#include namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 5cd7f9cde674..7f0ec0df57c5 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -14,6 +14,7 @@ #define LLVM_CODEGEN_MACHINEFRAMEINFO_H #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Register.h" #include "llvm/Support/Alignment.h" #include 
"llvm/Support/DataTypes.h" #include @@ -31,7 +32,7 @@ class AllocaInst; /// Callee saved reg can also be saved to a different register rather than /// on the stack by setting DstReg instead of FrameIdx. class CalleeSavedInfo { - unsigned Reg; + Register Reg; union { int FrameIdx; unsigned DstReg; @@ -58,14 +59,14 @@ public: : Reg(R), FrameIdx(FI), Restored(true), SpilledToReg(false) {} // Accessors. - unsigned getReg() const { return Reg; } + Register getReg() const { return Reg; } int getFrameIdx() const { return FrameIdx; } unsigned getDstReg() const { return DstReg; } void setFrameIdx(int FI) { FrameIdx = FI; SpilledToReg = false; } - void setDstReg(unsigned SpillReg) { + void setDstReg(Register SpillReg) { DstReg = SpillReg; SpilledToReg = true; } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFunction.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFunction.h index 809c21dd26fc..e9979c788ce0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineFunction.h @@ -431,6 +431,39 @@ public: using VariableDbgInfoMapTy = SmallVector; VariableDbgInfoMapTy VariableDbgInfos; + /// A count of how many instructions in the function have had numbers + /// assigned to them. Used for debug value tracking, to determine the + /// next instruction number. + unsigned DebugInstrNumberingCount = 0; + + /// Set value of DebugInstrNumberingCount field. Avoid using this unless + /// you're deserializing this data. + void setDebugInstrNumberingCount(unsigned Num); + + /// Pair of instruction number and operand number. + using DebugInstrOperandPair = std::pair; + + /// Substitution map: from one pair to another. Used to + /// record changes in where a value is defined, so that debug variable + /// locations can find it later. + std::map + DebugValueSubstitutions; + + /// Create a substitution between one value to a different, + /// new value. 
+ void makeDebugValueSubstitution(DebugInstrOperandPair, DebugInstrOperandPair); + + /// Create substitutions for any tracked values in \p Old, to point at + /// \p New. Needed when we re-create an instruction during optimization, + /// which has the same signature (i.e., def operands in the same place) but + /// a modified instruction type, flags, or otherwise. An example: X86 moves + /// are sometimes transformed into equivalent LEAs. + /// If the two instructions are not the same opcode, limit which operands to + /// examine for substitutions to the first N operands by setting + /// \p MaxOperand. + void substituteDebugValuesForInst(const MachineInstr &Old, MachineInstr &New, + unsigned MaxOperand = UINT_MAX); + MachineFunction(Function &F, const LLVMTargetMachine &Target, const TargetSubtargetInfo &STI, unsigned FunctionNum, MachineModuleInfo &MMI); @@ -494,7 +527,8 @@ public: /// Returns true if this function has basic block sections enabled. bool hasBBSections() const { return (BBSectionsType == BasicBlockSection::All || - BBSectionsType == BasicBlockSection::List); + BBSectionsType == BasicBlockSection::List || + BBSectionsType == BasicBlockSection::Preset); } /// Returns true if basic block labels are to be generated for this function. @@ -504,9 +538,6 @@ public: void setBBSectionsType(BasicBlockSection V) { BBSectionsType = V; } - /// Creates basic block Labels for this function. - void createBBLabels(); - /// Assign IsBeginSection IsEndSection fields for basic blocks in this /// function. void assignBeginEndSections(); @@ -769,7 +800,7 @@ public: /// CreateMachineInstr - Allocate a new MachineInstr. Use this instead /// of `new MachineInstr'. MachineInstr *CreateMachineInstr(const MCInstrDesc &MCID, const DebugLoc &DL, - bool NoImp = false); + bool NoImplicit = false); /// Create a new MachineInstr which is a copy of \p Orig, identical in all /// ways except the instruction has no parent, prev, or next. 
Bundling flags @@ -815,6 +846,14 @@ public: MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size); + /// getMachineMemOperand - Allocate a new MachineMemOperand by copying + /// an existing one, replacing only the MachinePointerInfo and size. + /// MachineMemOperands are owned by the MachineFunction and need not be + /// explicitly deallocated. + MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, + MachinePointerInfo &PtrInfo, + uint64_t Size); + /// Allocate a new MachineMemOperand by copying an existing one, /// replacing only AliasAnalysis information. MachineMemOperands are owned /// by the MachineFunction and need not be explicitly deallocated. @@ -1067,6 +1106,10 @@ public: /// the same callee. void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New); + + unsigned getNewDebugInstrNum() { + return ++DebugInstrNumberingCount; + } }; //===--------------------------------------------------------------------===// @@ -1133,6 +1176,11 @@ template <> struct GraphTraits> : } }; +class MachineFunctionAnalysisManager; +void verifyMachineFunction(MachineFunctionAnalysisManager *, + const std::string &Banner, + const MachineFunction &MF); + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINEFUNCTION_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstr.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstr.h index 970d6d7db334..f8d97c2c07a6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstr.h @@ -249,6 +249,10 @@ private: DebugLoc debugLoc; // Source line information. + /// Unique instruction number. Used by DBG_INSTR_REFs to refer to the values + /// defined by this instruction. 
+ unsigned DebugInstrNum; + // Intrusive list support friend struct ilist_traits; friend struct ilist_callback_traits; @@ -280,6 +284,9 @@ public: const MachineBasicBlock* getParent() const { return Parent; } MachineBasicBlock* getParent() { return Parent; } + /// Move the instruction before \p MovePos. + void moveBefore(MachineInstr *MovePos); + /// Return the function that contains the basic block that this instruction /// belongs to. /// @@ -441,6 +448,18 @@ public: /// this DBG_LABEL instruction. const DILabel *getDebugLabel() const; + /// Fetch the instruction number of this MachineInstr. If it does not have + /// one already, a new and unique number will be assigned. + unsigned getDebugInstrNum(); + + /// Examine the instruction number of this MachineInstr. May be zero if + /// it hasn't been assigned a number yet. + unsigned peekDebugInstrNum() const { return DebugInstrNum; } + + /// Set instruction number of this MachineInstr. Avoid using unless you're + /// deserializing this information. + void setDebugInstrNum(unsigned Num) { DebugInstrNum = Num; } + /// Emit an error referring to the source location of this instruction. /// This should only be used for inline assembly that is somehow /// impossible to compile. Other errors should have been handled much @@ -1137,12 +1156,22 @@ public: return getOpcode() == TargetOpcode::CFI_INSTRUCTION; } + bool isPseudoProbe() const { + return getOpcode() == TargetOpcode::PSEUDO_PROBE; + } + // True if the instruction represents a position in the function. 
bool isPosition() const { return isLabel() || isCFIInstruction(); } bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; } bool isDebugLabel() const { return getOpcode() == TargetOpcode::DBG_LABEL; } - bool isDebugInstr() const { return isDebugValue() || isDebugLabel(); } + bool isDebugRef() const { return getOpcode() == TargetOpcode::DBG_INSTR_REF; } + bool isDebugInstr() const { + return isDebugValue() || isDebugLabel() || isDebugRef(); + } + bool isDebugOrPseudoInstr() const { + return isDebugInstr() || isPseudoProbe(); + } bool isDebugOffsetImm() const { return getDebugOffset().isImm(); } @@ -1235,9 +1264,11 @@ public: case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: case TargetOpcode::DBG_VALUE: + case TargetOpcode::DBG_INSTR_REF: case TargetOpcode::DBG_LABEL: case TargetOpcode::LIFETIME_START: case TargetOpcode::LIFETIME_END: + case TargetOpcode::PSEUDO_PROBE: return true; } } @@ -1310,7 +1341,8 @@ public: /// Return true if the MachineInstr modifies (fully define or partially /// define) the specified register. /// NOTE: It's ignoring subreg indices on virtual registers. 
- bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const { + bool modifiesRegister(Register Reg, + const TargetRegisterInfo *TRI = nullptr) const { return findRegisterDefOperandIdx(Reg, false, true, TRI) != -1; } @@ -1761,8 +1793,10 @@ public: void setDebugValueUndef() { assert(isDebugValue() && "Must be a debug value instruction."); for (MachineOperand &MO : debug_operands()) { - if (MO.isReg()) + if (MO.isReg()) { MO.setReg(0); + MO.setSubReg(0); + } } } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstrBuilder.h index cabb9f1c97c9..115c50175604 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineInstrBuilder.h @@ -40,20 +40,30 @@ class MDNode; namespace RegState { - enum { - Define = 0x2, - Implicit = 0x4, - Kill = 0x8, - Dead = 0x10, - Undef = 0x20, - EarlyClobber = 0x40, - Debug = 0x80, - InternalRead = 0x100, - Renamable = 0x200, - DefineNoRead = Define | Undef, - ImplicitDefine = Implicit | Define, - ImplicitKill = Implicit | Kill - }; +enum { + /// Register definition. + Define = 0x2, + /// Not emitted register (e.g. carry, or temporary result). + Implicit = 0x4, + /// The last use of a register. + Kill = 0x8, + /// Unused definition. + Dead = 0x10, + /// Value of the register doesn't matter. + Undef = 0x20, + /// Register definition happens before uses. + EarlyClobber = 0x40, + /// Register 'use' is for debugging purpose. + Debug = 0x80, + /// Register reads a value that is defined inside the same instruction or + /// bundle. + InternalRead = 0x100, + /// Register that may be renamed. 
+ Renamable = 0x200, + DefineNoRead = Define | Undef, + ImplicitDefine = Implicit | Define, + ImplicitKill = Implicit | Kill +}; } // end namespace RegState @@ -295,6 +305,9 @@ public: case MachineOperand::MO_BlockAddress: return addBlockAddress(Disp.getBlockAddress(), Disp.getOffset() + off, TargetFlags); + case MachineOperand::MO_JumpTableIndex: + assert(off == 0 && "cannot create offset into jump tables"); + return addJumpTableIndex(Disp.getIndex(), TargetFlags); } } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h index 11781145b378..1d082bd03e5b 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineJumpTableInfo.h @@ -106,6 +106,9 @@ public: JumpTables[Idx].MBBs.clear(); } + /// RemoveMBBFromJumpTables - If MBB is present in any jump tables, remove it. + bool RemoveMBBFromJumpTables(MachineBasicBlock *MBB); + /// ReplaceMBBInJumpTables - If Old is the target of any jump tables, update /// the jump tables to branch to New instead. bool ReplaceMBBInJumpTables(MachineBasicBlock *Old, MachineBasicBlock *New); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopInfo.h index 8a93f91ae54d..c7491d4191de 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopInfo.h @@ -67,6 +67,12 @@ public: /// it returns an unknown location. DebugLoc getStartLoc() const; + /// Returns true if the instruction is loop invariant. + /// I.e., all virtual register operands are defined outside of the loop, + /// physical registers aren't accessed explicitly, and there are no side + /// effects that aren't captured by the operands or other flags. 
+ bool isLoopInvariant(MachineInstr &I) const; + void dump() const; private: diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopUtils.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopUtils.h index 2cb0134ca848..ec0b3529c0d6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineLoopUtils.h @@ -37,10 +37,6 @@ MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction, MachineRegisterInfo &MRI, const TargetInstrInfo *TII); -/// Return true if PhysReg is live outside the loop, i.e. determine if it -/// is live in the loop exit blocks, and false otherwise. -bool isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg); - } // namespace llvm #endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineModuleInfo.h index 0ee595b5b5ce..fa900affb214 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineModuleInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineModuleInfo.h @@ -54,8 +54,8 @@ class Module; //===----------------------------------------------------------------------===// /// This class can be derived from and used by targets to hold private /// target-specific information for each Module. Objects of type are -/// accessed/created with MMI::getInfo and destroyed when the MachineModuleInfo -/// is destroyed. +/// accessed/created with MachineModuleInfo::getObjFileInfo and destroyed when +/// the MachineModuleInfo is destroyed. /// class MachineModuleInfoImpl { public: @@ -83,6 +83,9 @@ class MachineModuleInfo { /// This is the MCContext used for the entire code generator. MCContext Context; + // This is an external context, that if assigned, will be used instead of the + // internal context. + MCContext *ExternalContext = nullptr; /// This is the LLVM Module being worked on. 
const Module *TheModule; @@ -149,6 +152,9 @@ class MachineModuleInfo { public: explicit MachineModuleInfo(const LLVMTargetMachine *TM = nullptr); + explicit MachineModuleInfo(const LLVMTargetMachine *TM, + MCContext *ExtContext); + MachineModuleInfo(MachineModuleInfo &&MMII); ~MachineModuleInfo(); @@ -158,8 +164,12 @@ public: const LLVMTargetMachine &getTarget() const { return TM; } - const MCContext &getContext() const { return Context; } - MCContext &getContext() { return Context; } + const MCContext &getContext() const { + return ExternalContext ? *ExternalContext : Context; + } + MCContext &getContext() { + return ExternalContext ? *ExternalContext : Context; + } const Module *getModule() const { return TheModule; } @@ -251,6 +261,12 @@ public: return Personalities; } /// \} + + // MMI owes MCContext. It should never be invalidated. + bool invalidate(Module &, const PreservedAnalyses &, + ModuleAnalysisManager::Invalidator &) { + return false; + } }; // End class MachineModuleInfo class MachineModuleInfoWrapperPass : public ImmutablePass { @@ -260,6 +276,9 @@ public: static char ID; // Pass identification, replacement for typeid explicit MachineModuleInfoWrapperPass(const LLVMTargetMachine *TM = nullptr); + explicit MachineModuleInfoWrapperPass(const LLVMTargetMachine *TM, + MCContext *ExtContext); + // Initialization and Finalization bool doInitialization(Module &) override; bool doFinalization(Module &) override; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h index 0f252137364c..b12351b8a702 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h @@ -33,7 +33,6 @@ class MachineRegisterInfo; class MCCFIInstruction; class MDNode; class ModuleSlotTracker; -class TargetMachine; class TargetIntrinsicInfo; class TargetRegisterInfo; class hash_code; @@ -728,12 +727,12 @@ public: /// 
ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. If an operand is known to be an immediate already, /// the setImm method should be used. - void ChangeToImmediate(int64_t ImmVal); + void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags = 0); /// ChangeToFPImmediate - Replace this operand with a new FP immediate operand /// of the specified value. If an operand is known to be an FP immediate /// already, the setFPImm method should be used. - void ChangeToFPImmediate(const ConstantFP *FPImm); + void ChangeToFPImmediate(const ConstantFP *FPImm, unsigned TargetFlags = 0); /// ChangeToES - Replace this operand with a new external symbol operand. void ChangeToES(const char *SymName, unsigned TargetFlags = 0); @@ -743,10 +742,10 @@ public: unsigned TargetFlags = 0); /// ChangeToMCSymbol - Replace this operand with a new MC symbol operand. - void ChangeToMCSymbol(MCSymbol *Sym); + void ChangeToMCSymbol(MCSymbol *Sym, unsigned TargetFlags = 0); /// Replace this operand with a frame index. - void ChangeToFrameIndex(int Idx); + void ChangeToFrameIndex(int Idx, unsigned TargetFlags = 0); /// Replace this operand with a target index. void ChangeToTargetIndex(unsigned Idx, int64_t Offset, @@ -759,6 +758,11 @@ public: bool isKill = false, bool isDead = false, bool isUndef = false, bool isDebug = false); + /// getTargetIndexName - If this MachineOperand is a TargetIndex that has a + /// name, attempt to get the name. Returns nullptr if the TargetIndex does not + /// have a name. Asserts if MO is not a TargetIndex. + const char *getTargetIndexName() const; + //===--------------------------------------------------------------------===// // Construction methods. 
//===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOutliner.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOutliner.h index 4a1b04ab3e88..a5dbbdb4fdcd 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOutliner.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineOutliner.h @@ -15,10 +15,11 @@ #ifndef LLVM_MACHINEOUTLINER_H #define LLVM_MACHINEOUTLINER_H +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" namespace llvm { namespace outliner { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassManager.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassManager.h new file mode 100644 index 000000000000..1489177d9668 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassManager.h @@ -0,0 +1,256 @@ +//===- PassManager.h --- Pass management for CodeGen ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines the pass manager interface for codegen. The codegen +// pipeline consists of only machine function passes. There is no container +// relationship between IR module/function and machine function in terms of pass +// manager organization. So there is no need for adaptor classes (for example +// ModuleToMachineFunctionAdaptor). Since invalidation could only happen among +// machine function passes, there is no proxy classes to handle cross-IR-unit +// invalidation. 
IR analysis results are provided for machine function passes by +// their respective analysis managers such as ModuleAnalysisManager and +// FunctionAnalysisManager. +// +// TODO: Add MachineFunctionProperties support. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINEPASSMANAGER_H +#define LLVM_CODEGEN_MACHINEPASSMANAGER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/type_traits.h" + +namespace llvm { +class Module; + +extern template class AnalysisManager; + +/// An AnalysisManager that also exposes IR analysis results. +class MachineFunctionAnalysisManager : public AnalysisManager { +public: + using Base = AnalysisManager; + + MachineFunctionAnalysisManager() : Base(false), FAM(nullptr), MAM(nullptr) {} + MachineFunctionAnalysisManager(FunctionAnalysisManager &FAM, + ModuleAnalysisManager &MAM, + bool DebugLogging = false) + : Base(DebugLogging), FAM(&FAM), MAM(&MAM) {} + MachineFunctionAnalysisManager(MachineFunctionAnalysisManager &&) = default; + MachineFunctionAnalysisManager & + operator=(MachineFunctionAnalysisManager &&) = default; + + /// Get the result of an analysis pass for a Function. + /// + /// Runs the analysis if a cached result is not available. + template typename PassT::Result &getResult(Function &F) { + return FAM->getResult(F); + } + + /// Get the cached result of an analysis pass for a Function. + /// + /// This method never runs the analysis. + /// + /// \returns null if there is no cached result. + template + typename PassT::Result *getCachedResult(Function &F) { + return FAM->getCachedResult(F); + } + + /// Get the result of an analysis pass for a Module. + /// + /// Runs the analysis if a cached result is not available. 
+ template typename PassT::Result &getResult(Module &M) { + return MAM->getResult(M); + } + + /// Get the cached result of an analysis pass for a Module. + /// + /// This method never runs the analysis. + /// + /// \returns null if there is no cached result. + template typename PassT::Result *getCachedResult(Module &M) { + return MAM->getCachedResult(M); + } + + /// Get the result of an analysis pass for a MachineFunction. + /// + /// Runs the analysis if a cached result is not available. + using Base::getResult; + + /// Get the cached result of an analysis pass for a MachineFunction. + /// + /// This method never runs the analysis. + /// + /// returns null if there is no cached result. + using Base::getCachedResult; + + // FIXME: Add LoopAnalysisManager or CGSCCAnalysisManager if needed. + FunctionAnalysisManager *FAM; + ModuleAnalysisManager *MAM; +}; + +extern template class PassManager; + +/// MachineFunctionPassManager adds/removes below features to/from the base +/// PassManager template instantiation. +/// +/// - Support passes that implement doInitialization/doFinalization. This is for +/// machine function passes to work on module level constructs. One such pass +/// is AsmPrinter. +/// +/// - Support machine module pass which runs over the module (for example, +/// MachineOutliner). A machine module pass needs to define the method: +/// +/// ```Error run(Module &, MachineFunctionAnalysisManager &)``` +/// +/// FIXME: machine module passes still need to define the usual machine +/// function pass interface, namely, +/// `PreservedAnalyses run(MachineFunction &, +/// MachineFunctionAnalysisManager &)` +/// But this interface wouldn't be executed. It is just a placeholder +/// to satisfy the pass manager type-erased inteface. This +/// special-casing of machine module pass is due to its limited use +/// cases and the unnecessary complexity it may bring to the machine +/// pass manager. 
+/// +/// - The base class `run` method is replaced by an alternative `run` method. +/// See details below. +/// +/// - Support codegening in the SCC order. Users include interprocedural +/// register allocation (IPRA). +class MachineFunctionPassManager + : public PassManager { + using Base = PassManager; + +public: + MachineFunctionPassManager(bool DebugLogging = false, + bool RequireCodeGenSCCOrder = false, + bool VerifyMachineFunction = false) + : Base(DebugLogging), RequireCodeGenSCCOrder(RequireCodeGenSCCOrder), + VerifyMachineFunction(VerifyMachineFunction) {} + MachineFunctionPassManager(MachineFunctionPassManager &&) = default; + MachineFunctionPassManager & + operator=(MachineFunctionPassManager &&) = default; + + /// Run machine passes for a Module. + /// + /// The intended use is to start the codegen pipeline for a Module. The base + /// class's `run` method is deliberately hidden by this due to the observation + /// that we don't yet have the use cases of compositing two instances of + /// machine pass managers, or compositing machine pass managers with other + /// types of pass managers. + Error run(Module &M, MachineFunctionAnalysisManager &MFAM); + + template void addPass(PassT &&Pass) { + Base::addPass(std::forward(Pass)); + PassConceptT *P = Passes.back().get(); + addDoInitialization(P); + addDoFinalization(P); + + // Add machine module pass. 
+ addRunOnModule(P); + } + +private: + template + using has_init_t = decltype(std::declval().doInitialization( + std::declval(), + std::declval())); + + template + std::enable_if_t::value> + addDoInitialization(PassConceptT *Pass) {} + + template + std::enable_if_t::value> + addDoInitialization(PassConceptT *Pass) { + using PassModelT = + detail::PassModel; + auto *P = static_cast(Pass); + InitializationFuncs.emplace_back( + [=](Module &M, MachineFunctionAnalysisManager &MFAM) { + return P->Pass.doInitialization(M, MFAM); + }); + } + + template + using has_fini_t = decltype(std::declval().doFinalization( + std::declval(), + std::declval())); + + template + std::enable_if_t::value> + addDoFinalization(PassConceptT *Pass) {} + + template + std::enable_if_t::value> + addDoFinalization(PassConceptT *Pass) { + using PassModelT = + detail::PassModel; + auto *P = static_cast(Pass); + FinalizationFuncs.emplace_back( + [=](Module &M, MachineFunctionAnalysisManager &MFAM) { + return P->Pass.doFinalization(M, MFAM); + }); + } + + template + using is_machine_module_pass_t = decltype(std::declval().run( + std::declval(), + std::declval())); + + template + using is_machine_function_pass_t = decltype(std::declval().run( + std::declval(), + std::declval())); + + template + std::enable_if_t::value> + addRunOnModule(PassConceptT *Pass) {} + + template + std::enable_if_t::value> + addRunOnModule(PassConceptT *Pass) { + static_assert(is_detected::value, + "machine module pass needs to define machine function pass " + "api. 
sorry."); + + using PassModelT = + detail::PassModel; + auto *P = static_cast(Pass); + MachineModulePasses.emplace( + Passes.size() - 1, + [=](Module &M, MachineFunctionAnalysisManager &MFAM) { + return P->Pass.run(M, MFAM); + }); + } + + using FuncTy = Error(Module &, MachineFunctionAnalysisManager &); + SmallVector, 4> InitializationFuncs; + SmallVector, 4> FinalizationFuncs; + + using PassIndex = decltype(Passes)::size_type; + std::map> MachineModulePasses; + + // Run codegen in the SCC order. + bool RequireCodeGenSCCOrder; + + bool VerifyMachineFunction; +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_MACHINEPASSMANAGER_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassRegistry.def new file mode 100644 index 000000000000..e9eaa5f77000 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -0,0 +1,197 @@ +//===- MachinePassRegistry.def - Registry of passes -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is used as the registry of passes that are for target-independent +// code generator. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
+ +#ifndef MODULE_ANALYSIS +#define MODULE_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (PIC)) +#undef MODULE_ANALYSIS + +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass, ()) +#undef MODULE_PASS + +#ifndef FUNCTION_ANALYSIS +#define FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (PIC)) +FUNCTION_ANALYSIS("targetir", TargetIRAnalysis, (std::move(TM.getTargetIRAnalysis()))) +#undef FUNCTION_ANALYSIS + +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +FUNCTION_PASS("mergeicmps", MergeICmpsPass, ()) +FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass, ()) +FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass, ()) +FUNCTION_PASS("consthoist", ConstantHoistingPass, ()) +FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ()) +FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass, (false)) +FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true)) +FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ()) +FUNCTION_PASS("lowerinvoke", LowerInvokePass, ()) +FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ()) +FUNCTION_PASS("verify", VerifierPass, ()) +#undef FUNCTION_PASS + +#ifndef LOOP_PASS +#define LOOP_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +LOOP_PASS("loop-reduce", LoopStrengthReducePass, ()) +#undef LOOP_PASS + +#ifndef MACHINE_MODULE_PASS +#define MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +#undef MACHINE_MODULE_PASS + +#ifndef MACHINE_FUNCTION_ANALYSIS +#define MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (PIC)) +// LiveVariables currently requires pure SSA 
form. +// FIXME: Once TwoAddressInstruction pass no longer uses kill flags, +// LiveVariables can be removed completely, and LiveIntervals can be directly +// computed. (We still either need to regenerate kill flags after regalloc, or +// preferably fix the scavenger to not depend on them). +// MACHINE_FUNCTION_ANALYSIS("live-vars", LiveVariablesAnalysis()) + +// MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass()) +// MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("edge-bundles", EdgeBundlesAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("lazy-machine-bfi", LazyMachineBlockFrequencyInfoAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-bfi", MachineBlockFrequencyInfoAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-loops", MachineLoopInfoAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-dom-frontier", MachineDominanceFrontierAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-dom-tree", MachineDominatorTreeAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-ore", MachineOptimizationRemarkEmitterPassAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-region-info", MachineRegionInfoPassAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("reaching-def", ReachingDefAnalysisAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", LiveRegMatrixAnalysis()) +// MACHINE_FUNCTION_ANALYSIS("gc-analysis", GCMachineCodeAnalysisPass()) +#undef MACHINE_FUNCTION_ANALYSIS + +#ifndef MACHINE_FUNCTION_PASS +#define MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +// MACHINE_FUNCTION_PASS("mir-printer", PrintMIRPass, ()) +// MACHINE_FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass, ()) +#undef MACHINE_FUNCTION_PASS + +// After a pass is converted to new pass manager, its entry should be moved from +// dummy table to the normal one. 
For example, for a machine function pass, +// DUMMY_MACHINE_FUNCTION_PASS to MACHINE_FUNCTION_PASS. + +#ifndef DUMMY_FUNCTION_PASS +#define DUMMY_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +DUMMY_FUNCTION_PASS("expandmemcmp", ExpandMemCmpPass, ()) +DUMMY_FUNCTION_PASS("gc-lowering", GCLoweringPass, ()) +DUMMY_FUNCTION_PASS("shadow-stack-gc-lowering", ShadowStackGCLoweringPass, ()) +DUMMY_FUNCTION_PASS("sjljehprepare", SjLjEHPreparePass, ()) +DUMMY_FUNCTION_PASS("dwarfehprepare", DwarfEHPass, ()) +DUMMY_FUNCTION_PASS("winehprepare", WinEHPass, ()) +DUMMY_FUNCTION_PASS("wasmehprepare", WasmEHPass, ()) +DUMMY_FUNCTION_PASS("codegenprepare", CodeGenPreparePass, ()) +DUMMY_FUNCTION_PASS("safe-stack", SafeStackPass, ()) +DUMMY_FUNCTION_PASS("stack-protector", StackProtectorPass, ()) +DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass, ()) +DUMMY_FUNCTION_PASS("interleaved-access", InterleavedAccessPass, ()) +DUMMY_FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, ()) +DUMMY_FUNCTION_PASS("cfguard-dispatch", CFGuardDispatchPass, ()) +DUMMY_FUNCTION_PASS("cfguard-check", CFGuardCheckPass, ()) +DUMMY_FUNCTION_PASS("gc-info-printer", GCInfoPrinterPass, ()) +#undef DUMMY_FUNCTION_PASS + +#ifndef DUMMY_MODULE_PASS +#define DUMMY_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +DUMMY_MODULE_PASS("lower-emutls", LowerEmuTLSPass, ()) +#undef DUMMY_MODULE_PASS + +#ifndef DUMMY_MACHINE_MODULE_PASS +#define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +DUMMY_MACHINE_MODULE_PASS("machine-outliner", MachineOutlinerPass, ()) +#undef DUMMY_MACHINE_MODULE_PASS + +#ifndef DUMMY_MACHINE_FUNCTION_PASS +#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) +#endif +DUMMY_MACHINE_FUNCTION_PASS("mir-printer", PrintMIRPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotPass, 
()) +DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("postrapseudos", ExpandPostRAPseudosPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("livedebugvalues", LiveDebugValuesPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass, ()) +DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass, 
()) +DUMMY_MACHINE_FUNCTION_PASS("virtregrewriter", VirtRegRewriterPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("twoaddressinstruction", TwoAddressInstructionPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("liveintervals", LiveIntervalsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("simple-register-coalescing", RegisterCoalescerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("branch-folder", BranchFolderPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass, ()) +DUMMY_MACHINE_FUNCTION_PASS("block-placement", MachineBlockPlacementPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("block-placement-stats", MachineBlockPlacementStatsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("break-false-deps", BreakFalseDepsPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("cfguard-longjmp", CFGuardLongjmpPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("ra-fast", RAFastPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("ra-greedy", RAGreedyPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("legalizer", LegalizerPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass, ()) 
+DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass, ()) +DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ()) +#undef DUMMY_MACHINE_FUNCTION_PASS diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePipeliner.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePipeliner.h index 8b2c27e7b888..f89a453749e8 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -40,8 +40,6 @@ #ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H #define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H -#include "llvm/Analysis/AliasAnalysis.h" - #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -51,6 +49,7 @@ namespace llvm { +class AAResults; class NodeSet; class SMSchedule; @@ -92,15 +91,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } + void getAnalysisUsage(AnalysisUsage &AU) const override; private: void preprocessPhiNodes(MachineBasicBlock &B); @@ -285,7 +276,7 @@ public: static bool classof(const ScheduleDAGInstrs *DAG) { return true; } private: - void addLoopCarriedDependences(AliasAnalysis *AA); + void addLoopCarriedDependences(AAResults *AA); void updatePhiDependences(); void changeDependences(); unsigned calculateResMII(); @@ -304,7 +295,7 @@ private: void checkValidNodeOrder(const NodeSetType &Circuits) const; bool schedulePipeline(SMSchedule &Schedule); bool computeDelta(MachineInstr &MI, unsigned &Delta); - MachineInstr *findDefInLoop(unsigned Reg); + MachineInstr *findDefInLoop(Register Reg); bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, unsigned &OffsetPos, unsigned 
&NewBase, int64_t &NewOffset); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 35aab5018fa4..57086b4eebd6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -442,10 +442,20 @@ public: /// Return true if there is exactly one operand defining the specified /// register. bool hasOneDef(Register RegNo) const { - def_iterator DI = def_begin(RegNo); - if (DI == def_end()) - return false; - return ++DI == def_end(); + return hasSingleElement(def_operands(RegNo)); + } + + /// Returns the defining operand if there is exactly one operand defining the + /// specified register, otherwise nullptr. + MachineOperand *getOneDef(Register Reg) const { + def_iterator DI = def_begin(Reg); + if (DI == def_end()) // No defs. + return nullptr; + + def_iterator OneDef = DI; + if (++DI == def_end()) + return &*OneDef; + return nullptr; // Multiple defs. } /// use_iterator/use_begin/use_end - Walk all uses of the specified register. @@ -498,10 +508,7 @@ public: /// hasOneUse - Return true if there is exactly one instruction using the /// specified register. bool hasOneUse(Register RegNo) const { - use_iterator UI = use_begin(RegNo); - if (UI == use_end()) - return false; - return ++UI == use_end(); + return hasSingleElement(use_operands(RegNo)); } /// use_nodbg_iterator/use_nodbg_begin/use_nodbg_end - Walk all uses of the @@ -612,14 +619,10 @@ public: /// function. Writing to a constant register has no effect. bool isConstantPhysReg(MCRegister PhysReg) const; - /// Returns true if either isConstantPhysReg or TRI->isCallerPreservedPhysReg - /// returns true. This is a utility member function. - bool isCallerPreservedOrConstPhysReg(MCRegister PhysReg) const; - /// Get an iterator over the pressure sets affected by the given physical or /// virtual register. 
If RegUnit is physical, it must be a register unit (from /// MCRegUnitIterator). - PSetIterator getPressureSets(unsigned RegUnit) const; + PSetIterator getPressureSets(Register RegUnit) const; //===--------------------------------------------------------------------===// // Virtual Register Info @@ -894,7 +897,7 @@ public: /// /// Reserved registers may belong to an allocatable register class, but the /// target has explicitly requested that they are not used. - bool isReserved(Register PhysReg) const { + bool isReserved(MCRegister PhysReg) const { return getReservedRegs().test(PhysReg.id()); } @@ -1174,14 +1177,13 @@ class PSetIterator { public: PSetIterator() = default; - PSetIterator(unsigned RegUnit, const MachineRegisterInfo *MRI) { + PSetIterator(Register RegUnit, const MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); - if (Register::isVirtualRegister(RegUnit)) { + if (RegUnit.isVirtual()) { const TargetRegisterClass *RC = MRI->getRegClass(RegUnit); PSet = TRI->getRegClassPressureSets(RC); Weight = TRI->getRegClassWeight(RC).RegWeight; - } - else { + } else { PSet = TRI->getRegUnitPressureSets(RegUnit); Weight = TRI->getRegUnitWeight(RegUnit); } @@ -1203,8 +1205,8 @@ public: } }; -inline PSetIterator MachineRegisterInfo:: -getPressureSets(unsigned RegUnit) const { +inline PSetIterator +MachineRegisterInfo::getPressureSets(Register RegUnit) const { return PSetIterator(RegUnit, this); } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineSSAUpdater.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineSSAUpdater.h index df972e12d461..0af356e376ab 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineSSAUpdater.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineSSAUpdater.h @@ -40,9 +40,6 @@ private: //typedef DenseMap AvailableValsTy; void *AV = nullptr; - /// VR - Current virtual register whose uses are being updated. 
- Register VR; - /// VRC - Register class of the current virtual register. const TargetRegisterClass *VRC; @@ -65,6 +62,7 @@ public: /// Initialize - Reset this object to get ready for a new set of SSA /// updates. void Initialize(Register V); + void Initialize(const TargetRegisterClass *RC); /// AddAvailableValue - Indicate that a rewritten value is available at the /// end of the specified block with the specified value. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineStableHash.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineStableHash.h new file mode 100644 index 000000000000..8423b2da1c78 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineStableHash.h @@ -0,0 +1,30 @@ +//===------------ MachineStableHash.h - MIR Stable Hashing Utilities ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Stable hashing for MachineInstr and MachineOperand. Useful or getting a +// hash across runs, modules, etc. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINESTABLEHASH_H +#define LLVM_CODEGEN_MACHINESTABLEHASH_H + +#include "llvm/CodeGen/StableHashing.h" + +namespace llvm { +class MachineInstr; +class MachineOperand; + +stable_hash stableHashValue(const MachineOperand &MO); +stable_hash stableHashValue(const MachineInstr &MI, bool HashVRegs = false, + bool HashConstantPoolIndices = false, + bool HashMemOperands = false); + +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineTraceMetrics.h index 025989504177..46b57365e653 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineTraceMetrics.h @@ -140,13 +140,13 @@ public: /// successors. struct LiveInReg { /// The virtual register required, or a register unit. - unsigned Reg; + Register Reg; /// For virtual registers: Minimum height of the defining instruction. /// For regunits: Height of the highest user in the trace. unsigned Height; - LiveInReg(unsigned Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {} + LiveInReg(Register Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {} }; /// Per-basic block information that relates to a specific trace through the diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MultiHazardRecognizer.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MultiHazardRecognizer.h new file mode 100644 index 000000000000..9846045ff014 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MultiHazardRecognizer.h @@ -0,0 +1,47 @@ +//=- llvm/CodeGen/MultiHazardRecognizer.h - Scheduling Support ----*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the MultiHazardRecognizer class, which is a wrapper +// for a set of ScheduleHazardRecognizer instances +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MULTIHAZARDRECOGNIZER_H +#define LLVM_CODEGEN_MULTIHAZARDRECOGNIZER_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" + +namespace llvm { + +class MachineInstr; +class SUnit; + +class MultiHazardRecognizer : public ScheduleHazardRecognizer { + SmallVector, 4> Recognizers; + +public: + MultiHazardRecognizer() = default; + void AddHazardRecognizer(std::unique_ptr &&); + + bool atIssueLimit() const override; + HazardType getHazardType(SUnit *, int Stalls = 0) override; + void Reset() override; + void EmitInstruction(SUnit *) override; + void EmitInstruction(MachineInstr *) override; + unsigned PreEmitNoops(SUnit *) override; + unsigned PreEmitNoops(MachineInstr *) override; + bool ShouldPreferAnother(SUnit *) override; + void AdvanceCycle() override; + void RecedeCycle() override; + void EmitNoop() override; +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_MULTIHAZARDRECOGNIZER_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h index 56db30ff7d6d..fe07c70d85c5 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h @@ -39,7 +39,7 @@ public: /// Get the offset of string \p S in the string table. This can insert a new /// element or return the offset of a pre-existing one. 
- uint32_t getStringOffset(StringRef S) { return getEntry(S).getOffset(); } + uint64_t getStringOffset(StringRef S) { return getEntry(S).getOffset(); } /// Get permanent storage for \p S (but do not necessarily emit \p S in the /// output section). A latter call to getStringOffset() with the same string @@ -57,7 +57,7 @@ public: private: MapTy Strings; - uint32_t CurrentEndOffset = 0; + uint64_t CurrentEndOffset = 0; unsigned NumEntries = 0; DwarfStringPoolEntryRef EmptyString; std::function Translator; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/Passes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/Passes.h index 9e5b4446c195..676ed2c65eb1 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/Passes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/Passes.h @@ -44,11 +44,15 @@ namespace llvm { /// the entry block. FunctionPass *createUnreachableBlockEliminationPass(); - /// createBBSectionsPrepare Pass - This pass assigns sections to machine basic - /// blocks and is enabled with -fbasic-block-sections. - /// Buf is a memory buffer that contains the list of functions and basic - /// block ids to selectively enable basic block sections. - MachineFunctionPass *createBBSectionsPreparePass(const MemoryBuffer *Buf); + /// createBasicBlockSections Pass - This pass assigns sections to machine + /// basic blocks and is enabled with -fbasic-block-sections. Buf is a memory + /// buffer that contains the list of functions and basic block ids to + /// selectively enable basic block sections. + MachineFunctionPass *createBasicBlockSectionsPass(const MemoryBuffer *Buf); + + /// createMachineFunctionSplitterPass - This pass splits machine functions + /// using profile information. + MachineFunctionPass *createMachineFunctionSplitterPass(); /// MachineFunctionPrinter pass - This pass prints out the machine function to /// the given stream as a debugging tool. @@ -72,10 +76,6 @@ namespace llvm { /// matching during instruction selection. 
FunctionPass *createCodeGenPreparePass(); - /// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather - /// and scatter intrinsics with scalar code when target doesn't support them. - FunctionPass *createScalarizeMaskedMemIntrinPass(); - /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. extern char &AtomicExpandID; @@ -387,10 +387,6 @@ namespace llvm { /// createJumpInstrTables - This pass creates jump-instruction tables. ModulePass *createJumpInstrTablesPass(); - /// createForwardControlFlowIntegrityPass - This pass adds control-flow - /// integrity. - ModulePass *createForwardControlFlowIntegrityPass(); - /// InterleavedAccess Pass - This pass identifies and matches interleaved /// memory accesses to target specific intrinsics. /// @@ -471,6 +467,9 @@ namespace llvm { /// Create Hardware Loop pass. \see HardwareLoops.cpp FunctionPass *createHardwareLoopsPass(); + /// This pass inserts pseudo probe annotation for callsite profiling. + FunctionPass *createPseudoProbeInserter(); + /// Create IR Type Promotion pass. \see TypePromotion.cpp FunctionPass *createTypePromotionPass(); @@ -483,9 +482,16 @@ namespace llvm { /// info was generated by another source such as clang. ModulePass *createStripDebugMachineModulePass(bool OnlyDebugified); + /// Creates MIR Check Debug pass. \see MachineCheckDebugify.cpp + ModulePass *createCheckDebugMachineModulePass(); + /// The pass fixups statepoint machine instruction to replace usage of /// caller saved registers with stack slots. extern char &FixupStatepointCallerSavedID; + + /// The pass transform load/store <256 x i32> to AMX load/store intrinsics + /// or split the data to two <128 x i32>. 
+ FunctionPass *createX86LowerAMXTypePass(); } // End llvm namespace #endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h index 585f43e116f9..00d6ec93d555 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFGraph.h @@ -753,10 +753,6 @@ namespace rdf { NodeAddr getNextRelated(NodeAddr IA, NodeAddr RA) const; - NodeAddr getNextImp(NodeAddr IA, - NodeAddr RA, bool Create); - NodeAddr getNextImp(NodeAddr IA, - NodeAddr RA) const; NodeAddr getNextShadow(NodeAddr IA, NodeAddr RA, bool Create); NodeAddr getNextShadow(NodeAddr IA, diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h index ea4890271726..d39d3585e7bd 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFLiveness.h @@ -18,6 +18,8 @@ #include "llvm/MC/LaneBitmask.h" #include #include +#include +#include #include namespace llvm { @@ -28,6 +30,30 @@ class MachineDominatorTree; class MachineRegisterInfo; class TargetRegisterInfo; +} // namespace llvm + +namespace llvm { +namespace rdf { +namespace detail { + +using NodeRef = std::pair; + +} // namespace detail +} // namespace rdf +} // namespace llvm + +namespace std { + +template <> struct hash { + std::size_t operator()(llvm::rdf::detail::NodeRef R) const { + return std::hash{}(R.first) ^ + std::hash{}(R.second.getAsInteger()); + } +}; + +} // namespace std + +namespace llvm { namespace rdf { struct Liveness { @@ -46,10 +72,9 @@ namespace rdf { std::map Map; }; - using NodeRef = std::pair; - using NodeRefSet = std::set; - // RegisterId in RefMap must be normalized. 
- using RefMap = std::map; + using NodeRef = detail::NodeRef; + using NodeRefSet = std::unordered_set; + using RefMap = std::unordered_map; Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()), @@ -110,15 +135,14 @@ namespace rdf { // Cache of mapping from node ids (for RefNodes) to the containing // basic blocks. Not computing it each time for each node reduces // the liveness calculation time by a large fraction. - using NodeBlockMap = DenseMap; - NodeBlockMap NBMap; + DenseMap NBMap; // Phi information: // // RealUseMap // map: NodeId -> (map: RegisterId -> NodeRefSet) // phi id -> (map: register -> set of reached non-phi uses) - std::map RealUseMap; + DenseMap RealUseMap; // Inverse iterated dominance frontier. std::map> IIDF; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h index 4afaf80e4659..c49b4883e1c1 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -91,6 +91,11 @@ namespace rdf { bool operator< (const RegisterRef &RR) const { return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask); } + + size_t hash() const { + return std::hash{}(Reg) ^ + std::hash{}(Mask.getAsInteger()); + } }; @@ -110,8 +115,6 @@ namespace rdf { return RegMasks.get(Register::stackSlot2Index(R)); } - RegisterRef normalize(RegisterRef RR) const; - bool alias(RegisterRef RA, RegisterRef RB) const { if (!isRegMaskId(RA.Reg)) return !isRegMaskId(RB.Reg) ? 
aliasRR(RA, RB) : aliasRM(RA, RB); @@ -128,6 +131,10 @@ namespace rdf { return MaskInfos[Register::stackSlot2Index(MaskId)].Units; } + const BitVector &getUnitAliases(uint32_t U) const { + return AliasInfos[U].Regs; + } + RegisterRef mapTo(RegisterRef RR, unsigned R) const; const TargetRegisterInfo &getTRI() const { return TRI; } @@ -142,12 +149,16 @@ namespace rdf { struct MaskInfo { BitVector Units; }; + struct AliasInfo { + BitVector Regs; + }; const TargetRegisterInfo &TRI; IndexedSet RegMasks; std::vector RegInfos; std::vector UnitInfos; std::vector MaskInfos; + std::vector AliasInfos; bool aliasRR(RegisterRef RA, RegisterRef RB) const; bool aliasRM(RegisterRef RR, RegisterRef RM) const; @@ -159,10 +170,15 @@ namespace rdf { : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {} RegisterAggr(const RegisterAggr &RG) = default; + unsigned count() const { return Units.count(); } bool empty() const { return Units.none(); } bool hasAliasOf(RegisterRef RR) const; bool hasCoverOf(RegisterRef RR) const; + bool operator==(const RegisterAggr &A) const { + return DenseMapInfo::isEqual(Units, A.Units); + } + static bool isCoverOf(RegisterRef RA, RegisterRef RB, const PhysicalRegisterInfo &PRI) { return RegisterAggr(PRI).insert(RA).hasCoverOf(RB); @@ -179,6 +195,10 @@ namespace rdf { RegisterRef clearIn(RegisterRef RR) const; RegisterRef makeRegRef() const; + size_t hash() const { + return DenseMapInfo::getHashValue(Units); + } + void print(raw_ostream &OS) const; struct rr_iterator { @@ -233,8 +253,27 @@ namespace rdf { }; raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P); + raw_ostream &operator<< (raw_ostream &OS, const RegisterAggr &A); } // end namespace rdf } // end namespace llvm +namespace std { + template <> struct hash { + size_t operator()(llvm::rdf::RegisterRef A) const { + return A.hash(); + } + }; + template <> struct hash { + size_t operator()(const llvm::rdf::RegisterAggr &A) const { + return A.hash(); + } + }; + template <> struct equal_to 
{ + bool operator()(const llvm::rdf::RegisterAggr &A, + const llvm::rdf::RegisterAggr &B) const { + return A == B; + } + }; +} #endif // LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index a8a436337e07..bcb48de2fe5a 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -139,23 +139,25 @@ public: /// Provides the instruction id of the closest reaching def instruction of /// PhysReg that reaches MI, relative to the begining of MI's basic block. - int getReachingDef(MachineInstr *MI, int PhysReg) const; + int getReachingDef(MachineInstr *MI, MCRegister PhysReg) const; /// Return whether A and B use the same def of PhysReg. - bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, int PhysReg) const; + bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, + MCRegister PhysReg) const; /// Return whether the reaching def for MI also is live out of its parent /// block. - bool isReachingDefLiveOut(MachineInstr *MI, int PhysReg) const; + bool isReachingDefLiveOut(MachineInstr *MI, MCRegister PhysReg) const; /// Return the local MI that produces the live out value for PhysReg, or /// nullptr for a non-live out or non-local def. MachineInstr *getLocalLiveOutMIDef(MachineBasicBlock *MBB, - int PhysReg) const; + MCRegister PhysReg) const; /// If a single MachineInstr creates the reaching definition, then return it. /// Otherwise return null. - MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, int PhysReg) const; + MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, + MCRegister PhysReg) const; /// If a single MachineInstr creates the reaching definition, for MIs operand /// at Idx, then return it. Otherwise return null. 
@@ -167,40 +169,45 @@ public: /// Provide whether the register has been defined in the same basic block as, /// and before, MI. - bool hasLocalDefBefore(MachineInstr *MI, int PhysReg) const; + bool hasLocalDefBefore(MachineInstr *MI, MCRegister PhysReg) const; /// Return whether the given register is used after MI, whether it's a local /// use or a live out. - bool isRegUsedAfter(MachineInstr *MI, int PhysReg) const; + bool isRegUsedAfter(MachineInstr *MI, MCRegister PhysReg) const; /// Return whether the given register is defined after MI. - bool isRegDefinedAfter(MachineInstr *MI, int PhysReg) const; + bool isRegDefinedAfter(MachineInstr *MI, MCRegister PhysReg) const; /// Provides the clearance - the number of instructions since the closest /// reaching def instuction of PhysReg that reaches MI. - int getClearance(MachineInstr *MI, MCPhysReg PhysReg) const; + int getClearance(MachineInstr *MI, MCRegister PhysReg) const; /// Provides the uses, in the same block as MI, of register that MI defines. /// This does not consider live-outs. - void getReachingLocalUses(MachineInstr *MI, int PhysReg, + void getReachingLocalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const; /// Search MBB for a definition of PhysReg and insert it into Defs. If no /// definition is found, recursively search the predecessor blocks for them. - void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs, + void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg, InstSet &Defs, BlockSet &VisitedBBs) const; - void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs) const; + void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg, + InstSet &Defs) const; /// For the given block, collect the instructions that use the live-in /// value of the provided register. Return whether the value is still /// live on exit. 
- bool getLiveInUses(MachineBasicBlock *MBB, int PhysReg, + bool getLiveInUses(MachineBasicBlock *MBB, MCRegister PhysReg, InstSet &Uses) const; /// Collect the users of the value stored in PhysReg, which is defined /// by MI. - void getGlobalUses(MachineInstr *MI, int PhysReg, - InstSet &Uses) const; + void getGlobalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const; + + /// Collect all possible definitions of the value stored in PhysReg, which is + /// used by MI. + void getGlobalReachingDefs(MachineInstr *MI, MCRegister PhysReg, + InstSet &Defs) const; /// Return whether From can be moved forwards to just before To. bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const; @@ -224,12 +231,13 @@ public: /// Return whether a MachineInstr could be inserted at MI and safely define /// the given register without affecting the program. - bool isSafeToDefRegAt(MachineInstr *MI, int PhysReg) const; + bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg) const; /// Return whether a MachineInstr could be inserted at MI and safely define /// the given register without affecting the program, ignoring any effects /// on the provided instructions. - bool isSafeToDefRegAt(MachineInstr *MI, int PhysReg, InstSet &Ignore) const; + bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg, + InstSet &Ignore) const; private: /// Set up LiveRegs by merging predecessor live-out values. @@ -264,7 +272,8 @@ private: /// Provides the instruction of the closest reaching def instruction of /// PhysReg that reaches MI, relative to the begining of MI's basic block. 
- MachineInstr *getReachingLocalMIDef(MachineInstr *MI, int PhysReg) const; + MachineInstr *getReachingLocalMIDef(MachineInstr *MI, + MCRegister PhysReg) const; }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegAllocPBQP.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegAllocPBQP.h index f7f92248f4ce..1ed55082e32c 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegAllocPBQP.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegAllocPBQP.h @@ -22,6 +22,8 @@ #include "llvm/CodeGen/PBQP/Math.h" #include "llvm/CodeGen/PBQP/ReductionRules.h" #include "llvm/CodeGen/PBQP/Solution.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/MC/MCRegister.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -96,13 +98,13 @@ public: AllowedRegVector() = default; AllowedRegVector(AllowedRegVector &&) = default; - AllowedRegVector(const std::vector &OptVec) - : NumOpts(OptVec.size()), Opts(new unsigned[NumOpts]) { + AllowedRegVector(const std::vector &OptVec) + : NumOpts(OptVec.size()), Opts(new MCRegister[NumOpts]) { std::copy(OptVec.begin(), OptVec.end(), Opts.get()); } unsigned size() const { return NumOpts; } - unsigned operator[](size_t I) const { return Opts[I]; } + MCRegister operator[](size_t I) const { return Opts[I]; } bool operator==(const AllowedRegVector &Other) const { if (NumOpts != Other.NumOpts) @@ -116,12 +118,12 @@ public: private: unsigned NumOpts = 0; - std::unique_ptr Opts; + std::unique_ptr Opts; }; inline hash_code hash_value(const AllowedRegVector &OptRegs) { - unsigned *OStart = OptRegs.Opts.get(); - unsigned *OEnd = OptRegs.Opts.get() + OptRegs.NumOpts; + MCRegister *OStart = OptRegs.Opts.get(); + MCRegister *OEnd = OptRegs.Opts.get() + OptRegs.NumOpts; return hash_combine(OptRegs.NumOpts, hash_combine_range(OStart, OEnd)); } @@ -143,11 +145,11 @@ public: LiveIntervals &LIS; MachineBlockFrequencyInfo &MBFI; - void setNodeIdForVReg(unsigned VReg, GraphBase::NodeId NId) { - VRegToNodeId[VReg] 
= NId; + void setNodeIdForVReg(Register VReg, GraphBase::NodeId NId) { + VRegToNodeId[VReg.id()] = NId; } - GraphBase::NodeId getNodeIdForVReg(unsigned VReg) const { + GraphBase::NodeId getNodeIdForVReg(Register VReg) const { auto VRegItr = VRegToNodeId.find(VReg); if (VRegItr == VRegToNodeId.end()) return GraphBase::invalidNodeId(); @@ -159,7 +161,7 @@ public: } private: - DenseMap VRegToNodeId; + DenseMap VRegToNodeId; AllowedRegVecPool AllowedRegVecs; }; @@ -197,8 +199,8 @@ public: NodeMetadata(NodeMetadata &&) = default; NodeMetadata& operator=(NodeMetadata &&) = default; - void setVReg(unsigned VReg) { this->VReg = VReg; } - unsigned getVReg() const { return VReg; } + void setVReg(Register VReg) { this->VReg = VReg; } + Register getVReg() const { return VReg; } void setAllowedRegs(GraphMetadata::AllowedRegVecRef AllowedRegs) { this->AllowedRegs = std::move(AllowedRegs); @@ -256,7 +258,7 @@ private: unsigned NumOpts = 0; unsigned DeniedOpts = 0; std::unique_ptr OptUnsafeEdges; - unsigned VReg = 0; + Register VReg; GraphMetadata::AllowedRegVecRef AllowedRegs; #ifndef NDEBUG diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/Register.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/Register.h index 054040cd29a1..d7057cfb76e0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/Register.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/Register.h @@ -40,24 +40,24 @@ public: /// frame index in a variable that normally holds a register. isStackSlot() /// returns true if Reg is in the range used for stack slots. /// - /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack - /// slots, so if a variable may contains a stack slot, always check - /// isStackSlot() first. - /// + /// FIXME: remove in favor of member. static bool isStackSlot(unsigned Reg) { return MCRegister::isStackSlot(Reg); } + /// Return true if this is a stack slot. 
+ bool isStack() const { return MCRegister::isStackSlot(Reg); } + /// Compute the frame index from a register value representing a stack slot. - static int stackSlot2Index(unsigned Reg) { - assert(isStackSlot(Reg) && "Not a stack slot"); + static int stackSlot2Index(Register Reg) { + assert(Reg.isStack() && "Not a stack slot"); return int(Reg - MCRegister::FirstStackSlot); } /// Convert a non-negative frame index to a stack slot register value. - static unsigned index2StackSlot(int FI) { + static Register index2StackSlot(int FI) { assert(FI >= 0 && "Cannot hold a negative frame index."); - return FI + MCRegister::FirstStackSlot; + return Register(FI + MCRegister::FirstStackSlot); } /// Return true if the specified register number is in @@ -69,20 +69,19 @@ public: /// Return true if the specified register number is in /// the virtual register namespace. static bool isVirtualRegister(unsigned Reg) { - assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); - return Reg & MCRegister::VirtualRegFlag; + return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); } /// Convert a virtual register number to a 0-based index. /// The first virtual register in a function will get the index 0. - static unsigned virtReg2Index(unsigned Reg) { + static unsigned virtReg2Index(Register Reg) { assert(isVirtualRegister(Reg) && "Not a virtual register"); return Reg & ~MCRegister::VirtualRegFlag; } /// Convert a 0-based index to a virtual register number. /// This is the inverse operation of VirtReg2IndexFunctor below. - static unsigned index2VirtReg(unsigned Index) { + static Register index2VirtReg(unsigned Index) { assert(Index < (1u << 31) && "Index too large for virtual register range."); return Index | MCRegister::VirtualRegFlag; } @@ -115,6 +114,15 @@ public: return MCRegister(Reg); } + /// Utility to check-convert this value to a MCRegister. The caller is + /// expected to have already validated that this Register is, indeed, + /// physical. 
+ MCRegister asMCReg() const { + assert(Reg == MCRegister::NoRegister || + MCRegister::isPhysicalRegister(Reg)); + return MCRegister(Reg); + } + bool isValid() const { return Reg != MCRegister::NoRegister; } /// Comparisons between register objects diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterPressure.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterPressure.h index 92333b859f1b..1deeb4d41511 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterPressure.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterPressure.h @@ -37,10 +37,10 @@ class MachineRegisterInfo; class RegisterClassInfo; struct RegisterMaskPair { - unsigned RegUnit; ///< Virtual register or register unit. + Register RegUnit; ///< Virtual register or register unit. LaneBitmask LaneMask; - RegisterMaskPair(unsigned RegUnit, LaneBitmask LaneMask) + RegisterMaskPair(Register RegUnit, LaneBitmask LaneMask) : RegUnit(RegUnit), LaneMask(LaneMask) {} }; @@ -157,7 +157,7 @@ public: const_iterator begin() const { return &PressureChanges[0]; } const_iterator end() const { return &PressureChanges[MaxPSets]; } - void addPressureChange(unsigned RegUnit, bool IsDec, + void addPressureChange(Register RegUnit, bool IsDec, const MachineRegisterInfo *MRI); void dump(const TargetRegisterInfo &TRI) const; @@ -275,24 +275,24 @@ private: RegSet Regs; unsigned NumRegUnits; - unsigned getSparseIndexFromReg(unsigned Reg) const { - if (Register::isVirtualRegister(Reg)) + unsigned getSparseIndexFromReg(Register Reg) const { + if (Reg.isVirtual()) return Register::virtReg2Index(Reg) + NumRegUnits; assert(Reg < NumRegUnits); return Reg; } - unsigned getRegFromSparseIndex(unsigned SparseIndex) const { + Register getRegFromSparseIndex(unsigned SparseIndex) const { if (SparseIndex >= NumRegUnits) - return Register::index2VirtReg(SparseIndex-NumRegUnits); - return SparseIndex; + return Register::index2VirtReg(SparseIndex - NumRegUnits); + return Register(SparseIndex); } 
public: void clear(); void init(const MachineRegisterInfo &MRI); - LaneBitmask contains(unsigned Reg) const { + LaneBitmask contains(Register Reg) const { unsigned SparseIndex = getSparseIndexFromReg(Reg); RegSet::const_iterator I = Regs.find(SparseIndex); if (I == Regs.end()) @@ -332,7 +332,7 @@ public: template void appendTo(ContainerT &To) const { for (const IndexMaskPair &P : Regs) { - unsigned Reg = getRegFromSparseIndex(P.Index); + Register Reg = getRegFromSparseIndex(P.Index); if (P.LaneMask.any()) To.push_back(RegisterMaskPair(Reg, P.LaneMask)); } @@ -390,7 +390,7 @@ class RegPressureTracker { LiveRegSet LiveRegs; /// Set of vreg defs that start a live range. - SparseSet UntiedDefs; + SparseSet UntiedDefs; /// Live-through pressure. std::vector LiveThruPressure; @@ -532,7 +532,7 @@ public: return getDownwardPressure(MI, PressureResult, MaxPressureResult); } - bool hasUntiedDef(unsigned VirtReg) const { + bool hasUntiedDef(Register VirtReg) const { return UntiedDefs.count(VirtReg); } @@ -548,9 +548,9 @@ protected: /// after the current position. 
SlotIndex getCurrSlot() const; - void increaseRegPressure(unsigned RegUnit, LaneBitmask PreviousMask, + void increaseRegPressure(Register RegUnit, LaneBitmask PreviousMask, LaneBitmask NewMask); - void decreaseRegPressure(unsigned RegUnit, LaneBitmask PreviousMask, + void decreaseRegPressure(Register RegUnit, LaneBitmask PreviousMask, LaneBitmask NewMask); void bumpDeadDefs(ArrayRef DeadDefs); @@ -561,9 +561,9 @@ protected: void discoverLiveInOrOut(RegisterMaskPair Pair, SmallVectorImpl &LiveInOrOut); - LaneBitmask getLastUsedLanes(unsigned RegUnit, SlotIndex Pos) const; - LaneBitmask getLiveLanesAt(unsigned RegUnit, SlotIndex Pos) const; - LaneBitmask getLiveThroughAt(unsigned RegUnit, SlotIndex Pos) const; + LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLiveLanesAt(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLiveThroughAt(Register RegUnit, SlotIndex Pos) const; }; void dumpRegSetPressure(ArrayRef SetPressure, diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterScavenging.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterScavenging.h index 5b5a80a67e7f..4f48ea2dc8e8 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterScavenging.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RegisterScavenging.h @@ -89,15 +89,6 @@ public: while (MBBI != I) forward(); } - /// Invert the behavior of forward() on the current instruction (undo the - /// changes to the available registers made by forward()). - void unprocess(); - - /// Unprocess instructions until you reach the provided iterator. - void unprocess(MachineBasicBlock::iterator I) { - while (MBBI != I) unprocess(); - } - /// Update internal register state and move MBB iterator backwards. /// Contrary to unprocess() this method gives precise results even in the /// absence of kill flags. @@ -203,10 +194,10 @@ private: void determineKillsAndDefs(); /// Add all Reg Units that Reg contains to BV. 
- void addRegUnits(BitVector &BV, Register Reg); + void addRegUnits(BitVector &BV, MCRegister Reg); /// Remove all Reg Units that \p Reg contains from \p BV. - void removeRegUnits(BitVector &BV, Register Reg); + void removeRegUnits(BitVector &BV, MCRegister Reg); /// Return the candidate register that is unused for the longest after /// StartMI. UseMI is set to the instruction where the search stopped. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h index b38cd4924174..bd63dd875621 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h @@ -107,7 +107,6 @@ namespace llvm { /// InitNumRegDefsLeft - Determine the # of regs defined by this node. /// void initNumRegDefsLeft(SUnit *SU); - void updateNumRegDefsLeft(SUnit *SU); int regPressureDelta(SUnit *SU, bool RawPressure = false); int rawRegPressureDelta (SUnit *SU, unsigned RCId); diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/RuntimeLibcalls.h index f71f39e5bf03..86e24cab76f6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/RuntimeLibcalls.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/RuntimeLibcalls.h @@ -15,6 +15,7 @@ #define LLVM_CODEGEN_RUNTIMELIBCALLS_H #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/AtomicOrdering.h" namespace llvm { namespace RTLIB { @@ -60,6 +61,10 @@ namespace RTLIB { /// UNKNOWN_LIBCALL if there is none. Libcall getSYNC(unsigned Opc, MVT VT); + /// Return the outline atomics value for the given opcode, atomic ordering + /// and type, or UNKNOWN_LIBCALL if there is none. 
+ Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT); + /// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return /// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or /// UNKNOW_LIBCALL if there is none. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 1eb9b9f322ba..50b186de2b05 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -268,6 +268,11 @@ namespace llvm { return SU->SchedClass; } + /// IsReachable - Checks if SU is reachable from TargetSU. + bool IsReachable(SUnit *SU, SUnit *TargetSU) { + return Topo.IsReachable(SU, TargetSU); + } + /// Returns an iterator to the top of the current scheduling region. MachineBasicBlock::iterator begin() const { return RegionBegin; } @@ -362,16 +367,6 @@ namespace llvm { void addVRegDefDeps(SUnit *SU, unsigned OperIdx); void addVRegUseDeps(SUnit *SU, unsigned OperIdx); - /// Initializes register live-range state for updating kills. - /// PostRA helper for rewriting kill flags. - void startBlockForKills(MachineBasicBlock *BB); - - /// Toggles a register operand kill flag. - /// - /// Other adjustments may be made to the instruction if necessary. Return - /// true if the operand has been deleted, false if not. - void toggleKillFlag(MachineInstr &MI, MachineOperand &MO); - /// Returns a mask for which lanes get read/written by the given (register) /// machine operand. LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const; @@ -393,10 +388,7 @@ namespace llvm { /// Returns an existing SUnit for this MI, or nullptr. 
inline SUnit *ScheduleDAGInstrs::getSUnit(MachineInstr *MI) const { - DenseMap::const_iterator I = MISUnitMap.find(MI); - if (I == MISUnitMap.end()) - return nullptr; - return I->second; + return MISUnitMap.lookup(MI); } } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleHazardRecognizer.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleHazardRecognizer.h index 37590f496ca2..9f1101b658d0 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleHazardRecognizer.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ScheduleHazardRecognizer.h @@ -57,7 +57,7 @@ public: /// other instruction is available, issue it first. /// * NoopHazard: issuing this instruction would break the program. If /// some other instruction can be issued, do so, otherwise issue a noop. - virtual HazardType getHazardType(SUnit *m, int Stalls = 0) { + virtual HazardType getHazardType(SUnit *, int Stalls = 0) { return NoHazard; } @@ -114,6 +114,14 @@ public: // Default implementation: count it as a cycle. AdvanceCycle(); } + + /// EmitNoops - This callback is invoked when noops were added to the + /// instruction stream. + virtual void EmitNoops(unsigned Quantity) { + // Default implementation: count it as a cycle. 
+ for (unsigned i = 0; i < Quantity; ++i) + EmitNoop(); + } }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAG.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAG.h index f26ab6f287a0..aeb488dd6c83 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -27,7 +27,6 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/DAGCombine.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -64,6 +63,7 @@ class ConstantFP; class ConstantInt; class DataLayout; struct fltSemantics; +class FunctionLoweringInfo; class GlobalValue; struct KnownBits; class LegacyDivergenceAnalysis; @@ -331,6 +331,29 @@ public: virtual void anchor(); }; + /// Help to insert SDNodeFlags automatically in transforming. Use + /// RAII to save and resume flags in current scope. + class FlagInserter { + SelectionDAG &DAG; + SDNodeFlags Flags; + FlagInserter *LastInserter; + + public: + FlagInserter(SelectionDAG &SDAG, SDNodeFlags Flags) + : DAG(SDAG), Flags(Flags), + LastInserter(SDAG.getFlagInserter()) { + SDAG.setFlagInserter(this); + } + FlagInserter(SelectionDAG &SDAG, SDNode *N) + : FlagInserter(SDAG, N->getFlags()) {} + + FlagInserter(const FlagInserter &) = delete; + FlagInserter &operator=(const FlagInserter &) = delete; + ~FlagInserter() { DAG.setFlagInserter(LastInserter); } + + const SDNodeFlags getFlags() const { return Flags; } + }; + /// When true, additional steps are taken to /// ensure that getConstant() and similar functions return DAG nodes that /// have legal types. 
This is important after type legalization since @@ -433,6 +456,9 @@ public: ProfileSummaryInfo *getPSI() const { return PSI; } BlockFrequencyInfo *getBFI() const { return BFI; } + FlagInserter *getFlagInserter() { return Inserter; } + void setFlagInserter(FlagInserter *FI) { Inserter = FI; } + /// Just dump dot graph to a user-provided path and title. /// This doesn't open the dot viewer program and /// helps visualization when outside debugging session. @@ -695,9 +721,7 @@ public: // When generating a branch to a BB, we don't in general know enough // to provide debug info for the BB at that time, so keep this one around. SDValue getBasicBlock(MachineBasicBlock *MBB); - SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl); SDValue getExternalSymbol(const char *Sym, EVT VT); - SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT); SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags = 0); SDValue getMCSymbol(MCSymbol *Sym, EVT VT); @@ -870,7 +894,7 @@ public: /// Returns sum of the base pointer and offset. /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. - SDValue getMemBasePlusOffset(SDValue Base, int64_t Offset, const SDLoc &DL, + SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); @@ -878,7 +902,7 @@ public: /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple /// components, create an add nuw from the base pointer to the offset. 
- SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, int64_t Offset) { + SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); return getMemBasePlusOffset(Ptr, Offset, SL, Flags); @@ -945,21 +969,31 @@ public: SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef Ops); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - ArrayRef Ops, const SDNodeFlags Flags = SDNodeFlags()); + ArrayRef Ops, const SDNodeFlags Flags); SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef ResultTys, ArrayRef Ops); SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, - ArrayRef Ops, const SDNodeFlags Flags = SDNodeFlags()); + ArrayRef Ops, const SDNodeFlags Flags); + + // Use flags from current flag inserter. + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, + ArrayRef Ops); + SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, + ArrayRef Ops); + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand); + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2); + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3); // Specialize based on number of operands. 
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, - const SDNodeFlags Flags = SDNodeFlags()); + const SDNodeFlags Flags); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, const SDNodeFlags Flags = SDNodeFlags()); + SDValue N2, const SDNodeFlags Flags); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3, - const SDNodeFlags Flags = SDNodeFlags()); + SDValue N2, SDValue N3, const SDNodeFlags Flags); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4); SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, @@ -1169,6 +1203,12 @@ public: SDValue getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain, int FrameIndex, int64_t Size, int64_t Offset = -1); + /// Creates a PseudoProbeSDNode with function GUID `Guid` and + /// the index of the block `Index` it is probing, as well as the attributes + /// `attr` of the probe. + SDValue getPseudoProbeNode(const SDLoc &Dl, SDValue Chain, uint64_t Guid, + uint64_t Index, uint32_t Attr); + /// Create a MERGE_VALUES node from the given operands. SDValue getMergeValues(ArrayRef Ops, const SDLoc &dl); @@ -1178,14 +1218,15 @@ public: /// This function will set the MOLoad flag on MMOFlags, but you can set it if /// you want. The MOStore flag must not be set. SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); /// FIXME: Remove once transition to Align is over. 
inline SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - MachinePointerInfo PtrInfo, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { @@ -1197,14 +1238,14 @@ public: SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, - MaybeAlign Alignment, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()); /// FIXME: Remove once transition to Align is over. inline SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, - unsigned Alignment = 0, + unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getExtLoad(ExtType, dl, VT, Chain, Ptr, PtrInfo, MemVT, @@ -1221,13 +1262,12 @@ public: MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr); - inline SDValue - getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, - const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset, - MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, - const AAMDNodes &AAInfo = AAMDNodes(), - const MDNode *Ranges = nullptr) { + inline SDValue getLoad( + ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl, + SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, + EVT MemVT, MaybeAlign Alignment = MaybeAlign(), + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { // Ensures that codegen never sees a None 
Alignment. return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, PtrInfo, MemVT, Alignment.getValueOr(getEVTAlign(MemVT)), MMOFlags, AAInfo, @@ -1237,7 +1277,7 @@ public: inline SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset, - MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr) { @@ -1260,7 +1300,7 @@ public: const AAMDNodes &AAInfo = AAMDNodes()); inline SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getStore(Chain, dl, Val, Ptr, PtrInfo, @@ -1270,7 +1310,7 @@ public: /// FIXME: Remove once transition to Align is over. inline SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getStore(Chain, dl, Val, Ptr, PtrInfo, MaybeAlign(Alignment), @@ -1285,7 +1325,8 @@ public: const AAMDNodes &AAInfo = AAMDNodes()); inline SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, EVT SVT, MaybeAlign Alignment, + MachinePointerInfo PtrInfo, EVT SVT, + MaybeAlign Alignment = MaybeAlign(), MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getTruncStore(Chain, dl, Val, Ptr, PtrInfo, SVT, @@ -1295,7 +1336,7 @@ public: /// FIXME: Remove once transition to Align is over. 
inline SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0, + MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, const AAMDNodes &AAInfo = AAMDNodes()) { return getTruncStore(Chain, dl, Val, Ptr, PtrInfo, SVT, @@ -1321,10 +1362,11 @@ public: ISD::MemIndexedMode AM); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType); + ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy); SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType); + ISD::MemIndexType IndexType, + bool IsTruncating = false); /// Construct a node to track a Value* through the backend. SDValue getSrcValue(const Value *v); @@ -1389,6 +1431,9 @@ public: void setNodeMemRefs(MachineSDNode *N, ArrayRef NewMemRefs); + // Calculate divergence of node \p N based on its operands. + bool calculateDivergence(SDNode *N); + // Propagates the change in divergence to users void updateDivergence(SDNode * N); @@ -1409,8 +1454,6 @@ public: EVT VT2, ArrayRef Ops); SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops); - SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, - EVT VT2, SDValue Op1); SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2); SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs, @@ -1468,8 +1511,13 @@ public: SDValue Operand, SDValue Subreg); /// Get the specified node if it's already available, or else return NULL. 
- SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef Ops, - const SDNodeFlags Flags = SDNodeFlags()); + SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, + ArrayRef Ops, const SDNodeFlags Flags); + SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, + ArrayRef Ops); + + /// Check if a node exists without modifying its flags. + bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef Ops); /// Creates a SDDbgValue node. SDDbgValue *getDbgValue(DIVariable *Var, DIExpression *Expr, SDNode *N, @@ -1543,7 +1591,14 @@ public: /// chain to the token factor. This ensures that the new memory node will have /// the same relative memory dependency position as the old load. Returns the /// new merged load chain. - SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New); + SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain); + + /// If an existing load has uses of its chain, create a token factor node with + /// that chain and the new memory node's chain and update users of the old + /// chain to the token factor. This ensures that the new memory node will have + /// the same relative memory dependency position as the old load. Returns the + /// new merged load chain. + SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp); /// Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their @@ -1781,7 +1836,8 @@ public: /// for \p DemandedElts. /// /// NOTE: The function will return true for a demanded splat of UNDEF values. - bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts); + bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, + unsigned Depth = 0); /// Test whether \p V has a splatted value. bool isSplatValue(SDValue V, bool AllowUndefs = false); @@ -1903,14 +1959,14 @@ public: } /// Test whether the given value is a constant int or similar node. 
- SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N); + SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const; /// Test whether the given value is a constant FP or similar node. - SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N); + SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ; /// \returns true if \p N is any kind of constant or build_vector of /// constants, int or float. If a vector, it may not necessarily be a splat. - inline bool isConstantValueOfAnyType(SDValue N) { + inline bool isConstantValueOfAnyType(SDValue N) const { return isConstantIntBuildVectorOrConstantInt(N) || isConstantFPBuildVectorOrConstantFP(N); } @@ -1958,6 +2014,10 @@ public: bool shouldOptForSize() const; + /// Get the (commutative) neutral element for the given opcode, if it exists. + SDValue getNeutralElement(unsigned Opcode, const SDLoc &DL, EVT VT, + SDNodeFlags Flags); + private: void InsertNode(SDNode *N); bool RemoveNodeFromCSEMaps(SDNode *N); @@ -1998,6 +2058,8 @@ private: std::map, SDNode *> TargetExternalSymbols; DenseMap MCSymbols; + + FlagInserter *Inserter = nullptr; }; template <> struct GraphTraits : public GraphTraits { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 3bfbf3765e4f..84bb11edd715 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -323,8 +323,6 @@ private: SDNode *MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, ArrayRef Ops, unsigned EmitNodeInfo); - SDNode *MutateStrictFPToFP(SDNode *Node, unsigned NewOpc); - /// Prepares the landing pad to take incoming values or do other EH /// personality specific tasks. Returns true if the block should be /// instruction selected, false if no code should be emitted for it. 
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 7c2b49087edd..000e383b71eb 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -85,29 +85,42 @@ namespace ISD { /// Node predicates - /// If N is a BUILD_VECTOR node whose elements are all the same constant or - /// undefined, return true and return the constant value in \p SplatValue. - bool isConstantSplatVector(const SDNode *N, APInt &SplatValue); - - /// Return true if the specified node is a BUILD_VECTOR where all of the - /// elements are ~0 or undef. - bool isBuildVectorAllOnes(const SDNode *N); - - /// Return true if the specified node is a BUILD_VECTOR where all of the - /// elements are 0 or undef. - bool isBuildVectorAllZeros(const SDNode *N); - - /// Return true if the specified node is a BUILD_VECTOR node of all - /// ConstantSDNode or undef. - bool isBuildVectorOfConstantSDNodes(const SDNode *N); - - /// Return true if the specified node is a BUILD_VECTOR node of all - /// ConstantFPSDNode or undef. - bool isBuildVectorOfConstantFPSDNodes(const SDNode *N); - - /// Return true if the node has at least one operand and all operands of the - /// specified node are ISD::UNDEF. - bool allOperandsUndef(const SDNode *N); +/// If N is a BUILD_VECTOR or SPLAT_VECTOR node whose elements are all the +/// same constant or undefined, return true and return the constant value in +/// \p SplatValue. +bool isConstantSplatVector(const SDNode *N, APInt &SplatValue); + +/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where +/// all of the elements are ~0 or undef. If \p BuildVectorOnly is set to +/// true, it only checks BUILD_VECTOR. 
+bool isConstantSplatVectorAllOnes(const SDNode *N, + bool BuildVectorOnly = false); + +/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where +/// all of the elements are 0 or undef. If \p BuildVectorOnly is set to true, it +/// only checks BUILD_VECTOR. +bool isConstantSplatVectorAllZeros(const SDNode *N, + bool BuildVectorOnly = false); + +/// Return true if the specified node is a BUILD_VECTOR where all of the +/// elements are ~0 or undef. +bool isBuildVectorAllOnes(const SDNode *N); + +/// Return true if the specified node is a BUILD_VECTOR where all of the +/// elements are 0 or undef. +bool isBuildVectorAllZeros(const SDNode *N); + +/// Return true if the specified node is a BUILD_VECTOR node of all +/// ConstantSDNode or undef. +bool isBuildVectorOfConstantSDNodes(const SDNode *N); + +/// Return true if the specified node is a BUILD_VECTOR node of all +/// ConstantFPSDNode or undef. +bool isBuildVectorOfConstantFPSDNodes(const SDNode *N); + +/// Return true if the node has at least one operand and all operands of the +/// specified node are ISD::UNDEF. +bool allOperandsUndef(const SDNode *N); } // end namespace ISD @@ -180,8 +193,8 @@ public: return getValueType().getSizeInBits(); } - TypeSize getScalarValueSizeInBits() const { - return getValueType().getScalarType().getSizeInBits(); + uint64_t getScalarValueSizeInBits() const { + return getValueType().getScalarType().getFixedSizeInBits(); } // Forwarding methods - These forward to the corresponding methods in SDNode. @@ -357,11 +370,6 @@ template<> struct simplify_type { /// the backend. struct SDNodeFlags { private: - // This bit is used to determine if the flags are in a defined state. - // Flag bits can only be masked out during intersection if the masking flags - // are defined. - bool AnyDefined : 1; - bool NoUnsignedWrap : 1; bool NoSignedWrap : 1; bool Exact : 1; @@ -383,9 +391,8 @@ private: public: /// Default constructor turns off all optimization flags. 
SDNodeFlags() - : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false), - Exact(false), NoNaNs(false), NoInfs(false), - NoSignedZeros(false), AllowReciprocal(false), + : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false), + NoInfs(false), NoSignedZeros(false), AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false), AllowReassociation(false), NoFPExcept(false) {} @@ -400,56 +407,18 @@ public: setAllowReassociation(FPMO.hasAllowReassoc()); } - /// Sets the state of the flags to the defined state. - void setDefined() { AnyDefined = true; } - /// Returns true if the flags are in a defined state. - bool isDefined() const { return AnyDefined; } - // These are mutators for each flag. - void setNoUnsignedWrap(bool b) { - setDefined(); - NoUnsignedWrap = b; - } - void setNoSignedWrap(bool b) { - setDefined(); - NoSignedWrap = b; - } - void setExact(bool b) { - setDefined(); - Exact = b; - } - void setNoNaNs(bool b) { - setDefined(); - NoNaNs = b; - } - void setNoInfs(bool b) { - setDefined(); - NoInfs = b; - } - void setNoSignedZeros(bool b) { - setDefined(); - NoSignedZeros = b; - } - void setAllowReciprocal(bool b) { - setDefined(); - AllowReciprocal = b; - } - void setAllowContract(bool b) { - setDefined(); - AllowContract = b; - } - void setApproximateFuncs(bool b) { - setDefined(); - ApproximateFuncs = b; - } - void setAllowReassociation(bool b) { - setDefined(); - AllowReassociation = b; - } - void setNoFPExcept(bool b) { - setDefined(); - NoFPExcept = b; - } + void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; } + void setNoSignedWrap(bool b) { NoSignedWrap = b; } + void setExact(bool b) { Exact = b; } + void setNoNaNs(bool b) { NoNaNs = b; } + void setNoInfs(bool b) { NoInfs = b; } + void setNoSignedZeros(bool b) { NoSignedZeros = b; } + void setAllowReciprocal(bool b) { AllowReciprocal = b; } + void setAllowContract(bool b) { AllowContract = b; } + void setApproximateFuncs(bool b) { ApproximateFuncs = b; } + void 
setAllowReassociation(bool b) { AllowReassociation = b; } + void setNoFPExcept(bool b) { NoFPExcept = b; } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } @@ -464,11 +433,9 @@ public: bool hasAllowReassociation() const { return AllowReassociation; } bool hasNoFPExcept() const { return NoFPExcept; } - /// Clear any flags in this flag set that aren't also set in Flags. - /// If the given Flags are undefined then don't do anything. + /// Clear any flags in this flag set that aren't also set in Flags. All + /// flags will be cleared if Flags are undefined. void intersectWith(const SDNodeFlags Flags) { - if (!Flags.isDefined()) - return; NoUnsignedWrap &= Flags.NoUnsignedWrap; NoSignedWrap &= Flags.NoSignedWrap; Exact &= Flags.Exact; @@ -559,6 +526,7 @@ BEGIN_TWO_BYTE_PACK() class LoadSDNodeBitfields { friend class LoadSDNode; friend class MaskedLoadSDNode; + friend class MaskedGatherSDNode; uint16_t : NumLSBaseSDNodeBits; @@ -569,6 +537,7 @@ BEGIN_TWO_BYTE_PACK() class StoreSDNodeBitfields { friend class StoreSDNode; friend class MaskedStoreSDNode; + friend class MaskedScatterSDNode; uint16_t : NumLSBaseSDNodeBits; @@ -720,9 +689,7 @@ public: bool use_empty() const { return UseList == nullptr; } /// Return true if there is exactly one use of this node. - bool hasOneUse() const { - return !use_empty() && std::next(use_begin()) == use_end(); - } + bool hasOneUse() const { return hasSingleElement(uses()); } /// Return the number of uses of this node. This method takes /// time proportional to the number of uses. @@ -1379,8 +1346,18 @@ public: } const SDValue &getChain() const { return getOperand(0); } + const SDValue &getBasePtr() const { - return getOperand(getOpcode() == ISD::STORE ? 
2 : 1); + switch (getOpcode()) { + case ISD::STORE: + case ISD::MSTORE: + return getOperand(2); + case ISD::MGATHER: + case ISD::MSCATTER: + return getOperand(3); + default: + return getOperand(1); + } } // Methods to support isa and dyn_cast @@ -1784,6 +1761,32 @@ public: } }; +/// This SDNode is used for PSEUDO_PROBE values, which are the function guid and +/// the index of the basic block being probed. A pseudo probe serves as a place +/// holder and will be removed at the end of compilation. It does not have any +/// operand because we do not want the instruction selection to deal with any. +class PseudoProbeSDNode : public SDNode { + friend class SelectionDAG; + uint64_t Guid; + uint64_t Index; + uint32_t Attributes; + + PseudoProbeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &Dl, + SDVTList VTs, uint64_t Guid, uint64_t Index, uint32_t Attr) + : SDNode(Opcode, Order, Dl, VTs), Guid(Guid), Index(Index), + Attributes(Attr) {} + +public: + uint64_t getGuid() const { return Guid; } + uint64_t getIndex() const { return Index; } + uint32_t getAttributes() const { return Attributes; } + + // Methods to support isa and dyn_cast + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::PSEUDO_PROBE; + } +}; + class JumpTableSDNode : public SDNode { friend class SelectionDAG; @@ -1944,6 +1947,33 @@ public: /// the vector width and set the bits where elements are undef. SDValue getSplatValue(BitVector *UndefElements = nullptr) const; + /// Find the shortest repeating sequence of values in the build vector. + /// + /// e.g. { u, X, u, X, u, u, X, u } -> { X } + /// { X, Y, u, Y, u, u, X, u } -> { X, Y } + /// + /// Currently this must be a power-of-2 build vector. + /// The DemandedElts mask indicates the elements that must be present, + /// undemanded elements in Sequence may be null (SDValue()). 
If passed a + /// non-null UndefElements bitvector, it will resize it to match the original + /// vector width and set the bits where elements are undef. If result is + /// false, Sequence will be empty. + bool getRepeatedSequence(const APInt &DemandedElts, + SmallVectorImpl &Sequence, + BitVector *UndefElements = nullptr) const; + + /// Find the shortest repeating sequence of values in the build vector. + /// + /// e.g. { u, X, u, X, u, u, X, u } -> { X } + /// { X, Y, u, Y, u, u, X, u } -> { X, Y } + /// + /// Currently this must be a power-of-2 build vector. + /// If passed a non-null UndefElements bitvector, it will resize it to match + /// the original vector width and set the bits where elements are undef. + /// If result is false, Sequence will be empty. + bool getRepeatedSequence(SmallVectorImpl &Sequence, + BitVector *UndefElements = nullptr) const; + /// Returns the demanded splatted constant or null if this is not a constant /// splat. /// @@ -2292,9 +2322,6 @@ public: // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru) // MaskedStoreSDNode (Chain, data, ptr, offset, mask) // Mask is a vector of i1 elements - const SDValue &getBasePtr() const { - return getOperand(getOpcode() == ISD::MLOAD ? 1 : 2); - } const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::MLOAD ? 
2 : 3); } @@ -2402,6 +2429,9 @@ public: ISD::MemIndexType getIndexType() const { return static_cast(LSBaseSDNodeBits.AddressingMode); } + void setIndexType(ISD::MemIndexType IndexType) { + LSBaseSDNodeBits.AddressingMode = IndexType; + } bool isIndexScaled() const { return (getIndexType() == ISD::SIGNED_SCALED) || (getIndexType() == ISD::UNSIGNED_SCALED); @@ -2434,12 +2464,18 @@ public: MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO, - ISD::MemIndexType IndexType) + ISD::MemIndexType IndexType, ISD::LoadExtType ETy) : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO, - IndexType) {} + IndexType) { + LoadSDNodeBits.ExtTy = ETy; + } const SDValue &getPassThru() const { return getOperand(1); } + ISD::LoadExtType getExtensionType() const { + return ISD::LoadExtType(LoadSDNodeBits.ExtTy); + } + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MGATHER; } @@ -2453,9 +2489,16 @@ public: MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO, - ISD::MemIndexType IndexType) + ISD::MemIndexType IndexType, bool IsTrunc) : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO, - IndexType) {} + IndexType) { + StoreSDNodeBits.IsTruncating = IsTrunc; + } + + /// Return true if the op does a truncation before store. + /// For integers this is the same as doing a TRUNCATE and storing the result. + /// For floats, it is the same as doing an FP_ROUND and storing the result. + bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; } const SDValue &getValue() const { return getOperand(1); } @@ -2605,7 +2648,8 @@ template <> struct GraphTraits { /// with 4 and 8 byte pointer alignment, respectively. using LargestSDNode = AlignedCharArrayUnion; + GlobalAddressSDNode, + PseudoProbeSDNode>; /// The SDNode class with the greatest alignment requirement. 
using MostAlignedSDNode = GlobalAddressSDNode; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h index 014523f1af6a..78f6fc6656fa 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -85,7 +85,7 @@ public: return SDValue(); } - /// Emit target-specific code that performs a memcmp, in cases where that is + /// Emit target-specific code that performs a memcmp/bcmp, in cases where that is /// faster than a libcall. The first returned SDValue is the result of the /// memcmp and the second is the chain. Both SDValues can be null if a normal /// libcall should be used. diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SlotIndexes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SlotIndexes.h index 19eab7ae5e35..b2133de93ea2 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SlotIndexes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SlotIndexes.h @@ -604,38 +604,27 @@ class raw_ostream; } /// Add the given MachineBasicBlock into the maps. - /// If \p InsertionPoint is specified then the block will be placed - /// before the given machine instr, otherwise it will be placed - /// before the next block in MachineFunction insertion order. 
- void insertMBBInMaps(MachineBasicBlock *mbb, - MachineInstr *InsertionPoint = nullptr) { - MachineFunction::iterator nextMBB = - std::next(MachineFunction::iterator(mbb)); - - IndexListEntry *startEntry = nullptr; - IndexListEntry *endEntry = nullptr; - IndexList::iterator newItr; - if (InsertionPoint) { - startEntry = createEntry(nullptr, 0); - endEntry = getInstructionIndex(*InsertionPoint).listEntry(); - newItr = indexList.insert(endEntry->getIterator(), startEntry); - } else if (nextMBB == mbb->getParent()->end()) { - startEntry = &indexList.back(); - endEntry = createEntry(nullptr, 0); - newItr = indexList.insertAfter(startEntry->getIterator(), endEntry); - } else { - startEntry = createEntry(nullptr, 0); - endEntry = getMBBStartIdx(&*nextMBB).listEntry(); - newItr = indexList.insert(endEntry->getIterator(), startEntry); - } + /// If it contains any instructions then they must already be in the maps. + /// This is used after a block has been split by moving some suffix of its + /// instructions into a newly created block. + void insertMBBInMaps(MachineBasicBlock *mbb) { + assert(mbb != &mbb->getParent()->front() && + "Can't insert a new block at the beginning of a function."); + auto prevMBB = std::prev(MachineFunction::iterator(mbb)); + + // Create a new entry to be used for the start of mbb and the end of + // prevMBB. + IndexListEntry *startEntry = createEntry(nullptr, 0); + IndexListEntry *endEntry = getMBBEndIdx(&*prevMBB).listEntry(); + IndexListEntry *insEntry = + mbb->empty() ? 
endEntry + : getInstructionIndex(mbb->front()).listEntry(); + IndexList::iterator newItr = + indexList.insert(insEntry->getIterator(), startEntry); SlotIndex startIdx(startEntry, SlotIndex::Slot_Block); SlotIndex endIdx(endEntry, SlotIndex::Slot_Block); - MachineFunction::iterator prevMBB(mbb); - assert(prevMBB != mbb->getParent()->end() && - "Can't insert a new block at the beginning of a function."); - --prevMBB; MBBRanges[prevMBB->getNumber()].second = startIdx; assert(unsigned(mbb->getNumber()) == MBBRanges.size() && diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/StableHashing.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/StableHashing.h new file mode 100644 index 000000000000..caf27e152e78 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/StableHashing.h @@ -0,0 +1,112 @@ +//===- llvm/CodeGen/StableHashing.h - Utilities for stable hashing * C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides types and functions for computing and combining stable +// hashes. Stable hashes can be useful for hashing across different modules, +// processes, or compiler runs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_STABLEHASHING_H +#define LLVM_CODEGEN_STABLEHASHING_H + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +/// An opaque object representing a stable hash code. It can be serialized, +/// deserialized, and is stable across processes and executions. 
+using stable_hash = uint64_t; + +// Implementation details +namespace hashing { +namespace detail { + +// Stable hashes are based on the 64-bit FNV-1 hash: +// https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function + +const uint64_t FNV_PRIME_64 = 1099511628211u; +const uint64_t FNV_OFFSET_64 = 14695981039346656037u; + +inline void stable_hash_append(stable_hash &Hash, const char Value) { + Hash = Hash ^ (Value & 0xFF); + Hash = Hash * FNV_PRIME_64; +} + +inline void stable_hash_append(stable_hash &Hash, stable_hash Value) { + for (unsigned I = 0; I < 8; ++I) { + stable_hash_append(Hash, static_cast(Value)); + Value >>= 8; + } +} + +} // namespace detail +} // namespace hashing + +inline stable_hash stable_hash_combine(stable_hash A, stable_hash B) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + hashing::detail::stable_hash_append(Hash, A); + hashing::detail::stable_hash_append(Hash, B); + return Hash; +} + +inline stable_hash stable_hash_combine(stable_hash A, stable_hash B, + stable_hash C) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + hashing::detail::stable_hash_append(Hash, A); + hashing::detail::stable_hash_append(Hash, B); + hashing::detail::stable_hash_append(Hash, C); + return Hash; +} + +inline stable_hash stable_hash_combine(stable_hash A, stable_hash B, + stable_hash C, stable_hash D) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + hashing::detail::stable_hash_append(Hash, A); + hashing::detail::stable_hash_append(Hash, B); + hashing::detail::stable_hash_append(Hash, C); + hashing::detail::stable_hash_append(Hash, D); + return Hash; +} + +/// Compute a stable_hash for a sequence of values. +/// +/// This hashes a sequence of values. It produces the same stable_hash as +/// 'stable_hash_combine(a, b, c, ...)', but can run over arbitrary sized +/// sequences and is significantly faster given pointers and types which +/// can be hashed as a sequence of bytes. 
+template +stable_hash stable_hash_combine_range(InputIteratorT First, + InputIteratorT Last) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + for (auto I = First; I != Last; ++I) + hashing::detail::stable_hash_append(Hash, *I); + return Hash; +} + +inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + for (size_t I = 0; I < C; ++I) + hashing::detail::stable_hash_append(Hash, P[I]); + return Hash; +} + +inline stable_hash stable_hash_combine_string(const StringRef &S) { + return stable_hash_combine_range(S.begin(), S.end()); +} + +inline stable_hash stable_hash_combine_string(const char *C) { + stable_hash Hash = hashing::detail::FNV_OFFSET_64; + while (*C) + hashing::detail::stable_hash_append(Hash, *(C++)); + return Hash; +} + +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/StackMaps.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/StackMaps.h index e33ee226e41a..928d7cc6cc04 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/StackMaps.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/StackMaps.h @@ -148,9 +148,13 @@ public: /// , , /// , , /// , , [deopt args...], -/// -/// Note that the last two sets of arguments are not currently length -/// prefixed. +/// , , [gc pointer args...], +/// , , [gc allocas args...], +/// , , [base/derived pairs] +/// base/derived pairs in gc map are logical indices into +/// section. +/// All gc pointers assigned to VRegs produce new value (in form of MI Def +/// operand) and are tied to it. 
class StatepointOpers { // TODO:: we should change the STATEPOINT representation so that CC and // Flags should be part of meta operands, with args and deopt operands, and @@ -166,21 +170,23 @@ class StatepointOpers { enum { CCOffset = 1, FlagsOffset = 3, NumDeoptOperandsOffset = 5 }; public: - explicit StatepointOpers(const MachineInstr *MI) : MI(MI) {} + explicit StatepointOpers(const MachineInstr *MI) : MI(MI) { + NumDefs = MI->getNumDefs(); + } /// Get index of statepoint ID operand. - unsigned getIDPos() const { return IDPos; } + unsigned getIDPos() const { return NumDefs + IDPos; } /// Get index of Num Patch Bytes operand. - unsigned getNBytesPos() const { return NBytesPos; } + unsigned getNBytesPos() const { return NumDefs + NBytesPos; } /// Get index of Num Call Arguments operand. - unsigned getNCallArgsPos() const { return NCallArgsPos; } + unsigned getNCallArgsPos() const { return NumDefs + NCallArgsPos; } /// Get starting index of non call related arguments /// (calling convention, statepoint flags, vm state and gc state). unsigned getVarIdx() const { - return MI->getOperand(NCallArgsPos).getImm() + MetaEnd; + return MI->getOperand(NumDefs + NCallArgsPos).getImm() + MetaEnd + NumDefs; } /// Get index of Calling Convention operand. @@ -195,16 +201,16 @@ public: } /// Return the ID for the given statepoint. - uint64_t getID() const { return MI->getOperand(IDPos).getImm(); } + uint64_t getID() const { return MI->getOperand(NumDefs + IDPos).getImm(); } /// Return the number of patchable bytes the given statepoint should emit. uint32_t getNumPatchBytes() const { - return MI->getOperand(NBytesPos).getImm(); + return MI->getOperand(NumDefs + NBytesPos).getImm(); } /// Return the target of the underlying call. const MachineOperand &getCallTarget() const { - return MI->getOperand(CallTargetPos); + return MI->getOperand(NumDefs + CallTargetPos); } /// Return the calling convention. @@ -215,8 +221,31 @@ public: /// Return the statepoint flags. 
uint64_t getFlags() const { return MI->getOperand(getFlagsIdx()).getImm(); } + uint64_t getNumDeoptArgs() const { + return MI->getOperand(getNumDeoptArgsIdx()).getImm(); + } + + /// Get index of number of gc map entries. + unsigned getNumGcMapEntriesIdx(); + + /// Get index of number of gc allocas. + unsigned getNumAllocaIdx(); + + /// Get index of number of GC pointers. + unsigned getNumGCPtrIdx(); + + /// Get index of first GC pointer operand of -1 if there are none. + int getFirstGCPtrIdx(); + + /// Get vector of base/derived pairs from statepoint. + /// Elements are indices into GC Pointer operand list (logical). + /// Returns number of elements in GCMap. + unsigned + getGCPointerMap(SmallVectorImpl> &GCMap); + private: const MachineInstr *MI; + unsigned NumDefs; }; class StackMaps { @@ -258,6 +287,10 @@ public: StackMaps(AsmPrinter &AP); + /// Get index of next meta operand. + /// Similar to parseOperand, but does not actually parses operand meaning. + static unsigned getNextMetaArgIdx(const MachineInstr *MI, unsigned CurIdx); + void reset() { CSInfos.clear(); ConstPool.clear(); @@ -330,6 +363,13 @@ private: MachineInstr::const_mop_iterator MOE, LocationVec &Locs, LiveOutVec &LiveOuts) const; + /// Specialized parser of statepoint operands. + /// They do not directly correspond to StackMap record entries. + void parseStatepointOpers(const MachineInstr &MI, + MachineInstr::const_mop_iterator MOI, + MachineInstr::const_mop_iterator MOE, + LocationVec &Locations, LiveOutVec &LiveOuts); + /// Create a live-out register record for the given register @p Reg. 
LiveOutReg createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h index 4d6afa617d3a..51f1d7d6fd21 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h @@ -10,16 +10,21 @@ #define LLVM_CODEGEN_SWITCHLOWERINGUTILS_H #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/Support/BranchProbability.h" +#include namespace llvm { +class BlockFrequencyInfo; +class ConstantInt; class FunctionLoweringInfo; class MachineBasicBlock; -class BlockFrequencyInfo; +class ProfileSummaryInfo; +class TargetLowering; +class TargetMachine; namespace SwitchCG { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetCallingConv.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetCallingConv.h index 347d7ff40404..df974b499851 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetCallingConv.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetCallingConv.h @@ -31,6 +31,7 @@ namespace ISD { unsigned IsInReg : 1; ///< Passed in register unsigned IsSRet : 1; ///< Hidden struct-ret ptr unsigned IsByVal : 1; ///< Struct passed by value + unsigned IsByRef : 1; ///< Passed in memory unsigned IsNest : 1; ///< Nested fn static chain unsigned IsReturned : 1; ///< Always returned unsigned IsSplit : 1; @@ -43,25 +44,31 @@ namespace ISD { unsigned IsHva : 1; ///< HVA field for unsigned IsHvaStart : 1; ///< HVA structure start unsigned IsSecArgPass : 1; ///< Second argument - unsigned ByValAlign : 4; ///< Log 2 of byval alignment + unsigned ByValOrByRefAlign : 4; ///< Log 2 of byval/byref alignment unsigned OrigAlign : 5; 
///< Log 2 of original alignment unsigned IsInConsecutiveRegsLast : 1; unsigned IsInConsecutiveRegs : 1; unsigned IsCopyElisionCandidate : 1; ///< Argument copy elision candidate unsigned IsPointer : 1; - unsigned ByValSize; ///< Byval struct size + unsigned ByValOrByRefSize; ///< Byval or byref struct size unsigned PointerAddrSpace; ///< Address space of pointer argument + /// Set the alignment used by byref or byval parameters. + void setAlignImpl(Align A) { + ByValOrByRefAlign = encode(A); + assert(getNonZeroByValAlign() == A && "bitfield overflow"); + } + public: ArgFlagsTy() - : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsNest(0), - IsReturned(0), IsSplit(0), IsInAlloca(0), IsPreallocated(0), + : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsByRef(0), + IsNest(0), IsReturned(0), IsSplit(0), IsInAlloca(0), IsPreallocated(0), IsSplitEnd(0), IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), - IsHva(0), IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0), - IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), - IsCopyElisionCandidate(0), IsPointer(0), ByValSize(0), + IsHva(0), IsHvaStart(0), IsSecArgPass(0), ByValOrByRefAlign(0), + OrigAlign(0), IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), + IsCopyElisionCandidate(0), IsPointer(0), ByValOrByRefSize(0), PointerAddrSpace(0) { static_assert(sizeof(*this) == 3 * sizeof(unsigned), "flags are too big"); } @@ -81,6 +88,9 @@ namespace ISD { bool isByVal() const { return IsByVal; } void setByVal() { IsByVal = 1; } + bool isByRef() const { return IsByRef; } + void setByRef() { IsByRef = 1; } + bool isInAlloca() const { return IsInAlloca; } void setInAlloca() { IsInAlloca = 1; } @@ -112,10 +122,12 @@ namespace ISD { void setReturned() { IsReturned = 1; } bool isInConsecutiveRegs() const { return IsInConsecutiveRegs; } - void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; } + void setInConsecutiveRegs(bool Flag = true) { IsInConsecutiveRegs = Flag; } bool 
isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; } - void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; } + void setInConsecutiveRegsLast(bool Flag = true) { + IsInConsecutiveRegsLast = Flag; + } bool isSplit() const { return IsSplit; } void setSplit() { IsSplit = 1; } @@ -131,17 +143,22 @@ namespace ISD { LLVM_ATTRIBUTE_DEPRECATED(unsigned getByValAlign() const, "Use getNonZeroByValAlign() instead") { - MaybeAlign A = decodeMaybeAlign(ByValAlign); + MaybeAlign A = decodeMaybeAlign(ByValOrByRefAlign); return A ? A->value() : 0; } Align getNonZeroByValAlign() const { - MaybeAlign A = decodeMaybeAlign(ByValAlign); + MaybeAlign A = decodeMaybeAlign(ByValOrByRefAlign); assert(A && "ByValAlign must be defined"); return *A; } void setByValAlign(Align A) { - ByValAlign = encode(A); - assert(getNonZeroByValAlign() == A && "bitfield overflow"); + assert(isByVal() && !isByRef()); + setAlignImpl(A); + } + + void setByRefAlign(Align A) { + assert(!isByVal() && isByRef()); + setAlignImpl(A); } LLVM_ATTRIBUTE_DEPRECATED(unsigned getOrigAlign() const, @@ -157,8 +174,23 @@ namespace ISD { assert(getNonZeroOrigAlign() == A && "bitfield overflow"); } - unsigned getByValSize() const { return ByValSize; } - void setByValSize(unsigned S) { ByValSize = S; } + unsigned getByValSize() const { + assert(isByVal() && !isByRef()); + return ByValOrByRefSize; + } + void setByValSize(unsigned S) { + assert(isByVal() && !isByRef()); + ByValOrByRefSize = S; + } + + unsigned getByRefSize() const { + assert(!isByVal() && isByRef()); + return ByValOrByRefSize; + } + void setByRefSize(unsigned S) { + assert(!isByVal() && isByRef()); + ByValOrByRefSize = S; + } unsigned getPointerAddrSpace() const { return PointerAddrSpace; } void setPointerAddrSpace(unsigned AS) { PointerAddrSpace = AS; } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetFrameLowering.h index d6580430daf7..792452f6e81d 
100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -14,6 +14,7 @@ #define LLVM_CODEGEN_TARGETFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/Support/TypeSize.h" #include namespace llvm { @@ -26,7 +27,7 @@ namespace TargetStackID { enum Value { Default = 0, SGPRSpill = 1, - SVEVector = 2, + ScalableVector = 2, NoAlloc = 255 }; } @@ -297,8 +298,8 @@ public: /// getFrameIndexReference - This method should return the base register /// and offset used to reference a frame index location. The offset is /// returned directly, and the base register is returned via FrameReg. - virtual int getFrameIndexReference(const MachineFunction &MF, int FI, - Register &FrameReg) const; + virtual StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const; /// Same as \c getFrameIndexReference, except that the stack pointer (as /// opposed to the frame pointer) will be the preferred value for \p @@ -306,9 +307,10 @@ public: /// use offsets from RSP. If \p IgnoreSPUpdates is true, the returned /// offset is only guaranteed to be valid with respect to the value of SP at /// the end of the prologue. - virtual int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - Register &FrameReg, - bool IgnoreSPUpdates) const { + virtual StackOffset + getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, + Register &FrameReg, + bool IgnoreSPUpdates) const { // Always safe to dispatch to getFrameIndexReference. return getFrameIndexReference(MF, FI, FrameReg); } @@ -316,8 +318,8 @@ public: /// getNonLocalFrameIndexReference - This method returns the offset used to /// reference a frame index location. The offset can be from either FP/BP/SP /// based on which base register is returned by llvm.localaddress. 
- virtual int getNonLocalFrameIndexReference(const MachineFunction &MF, - int FI) const { + virtual StackOffset getNonLocalFrameIndexReference(const MachineFunction &MF, + int FI) const { // By default, dispatch to getFrameIndexReference. Interested targets can // override this. Register FrameReg; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetInstrInfo.h index b3b2fa218627..36afdefd27b2 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOutliner.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/BranchProbability.h" @@ -80,6 +81,15 @@ struct RegImmPair { RegImmPair(Register Reg, int64_t Imm) : Reg(Reg), Imm(Imm) {} }; +/// Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare. +/// It holds the register values, the scale value and the displacement. +struct ExtAddrMode { + Register BaseReg; + Register ScaledReg; + int64_t Scale; + int64_t Displacement; +}; + //--------------------------------------------------------------------------- /// /// TargetInstrInfo - Interface to description of machine instruction set @@ -339,6 +349,12 @@ public: unsigned &Size, unsigned &Offset, const MachineFunction &MF) const; + /// Return true if the given instruction is terminator that is unspillable, + /// according to isUnspillableTerminatorImpl. + bool isUnspillableTerminator(const MachineInstr *MI) const { + return MI->isTerminator() && isUnspillableTerminatorImpl(MI); + } + /// Returns the size in bytes of the specified MachineInstr, or ~0U /// when this function is not implemented by a target. 
virtual unsigned getInstSizeInBytes(const MachineInstr &MI) const { @@ -724,7 +740,7 @@ public: return nullptr; } - /// Analyze the loop code, return true if it cannot be understoo. Upon + /// Analyze the loop code, return true if it cannot be understood. Upon /// success, this function returns false and returns information about the /// induction variable and compare instruction used at the end. virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, @@ -771,7 +787,7 @@ public: /// Second variant of isProfitableToIfCvt. This one /// checks for the case where two basic blocks from true and false path - /// of a if-then-else (diamond) are predicated on mutally exclusive + /// of a if-then-else (diamond) are predicated on mutually exclusive /// predicates, where the probability of the true path being taken is given /// by Probability, and Confidence is a measure of our confidence that it /// will be properly predicted. @@ -945,6 +961,17 @@ protected: return None; } + /// Return true if the given terminator MI is not expected to spill. This + /// sets the live interval as not spillable and adjusts phi node lowering to + /// not introduce copies after the terminator. Use with care, these are + /// currently used for hardware loop intrinsics in very controlled situations, + /// created prior to registry allocation in loops that only have single phi + /// users for the terminators value. They may run out of registers if not used + /// carefully. + virtual bool isUnspillableTerminatorImpl(const MachineInstr *MI) const { + return false; + } + public: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source @@ -968,6 +995,15 @@ public: return None; } + /// Returns true if MI is an instruction that defines Reg to have a constant + /// value and the value is recorded in ImmVal. The ImmVal is a result that + /// should be interpreted as modulo size of Reg. 
+ virtual bool getConstValDefinedInReg(const MachineInstr &MI, + const Register Reg, + int64_t &ImmVal) const { + return false; + } + /// Store the specified register of the given register class to the specified /// stack frame index. The store instruction is to be added to the given /// machine basic block before the specified machine instruction. If isKill @@ -1041,9 +1077,23 @@ public: /// faster sequence. /// \param Root - Instruction that could be combined with one of its operands /// \param Patterns - Vector of possible combination patterns - virtual bool getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl &Patterns) const; + virtual bool + getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns, + bool DoRegPressureReduce) const; + + /// Return true if target supports reassociation of instructions in machine + /// combiner pass to reduce register pressure for a given BB. + virtual bool + shouldReduceRegisterPressure(MachineBasicBlock *MBB, + RegisterClassInfo *RegClassInfo) const { + return false; + } + + /// Fix up the placeholder we may add in genAlternativeCodeSequence(). + virtual void + finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P, + SmallVectorImpl &InsInstrs) const {} /// Return true when a code sequence can improve throughput. It /// should be called only for instructions in loops. @@ -1248,10 +1298,11 @@ public: bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const; - /// Get the base operands and byte offset of an instruction that reads/writes - /// memory. + /// Get zero or more base operands and the byte offset of an instruction that + /// reads/writes memory. Note that there may be zero base operands if the + /// instruction accesses a constant address. /// It returns false if MI does not read/write memory. - /// It returns false if no base operands and offset was found. + /// It returns false if base operands and offset could not be determined. 
/// It is not guaranteed to always recognize base operands and offsets in all /// cases. virtual bool getMemOperandsWithOffsetWidth( @@ -1270,6 +1321,27 @@ public: return false; } + /// Target dependent implementation to get the values constituting the address + /// MachineInstr that is accessing memory. These values are returned as a + /// struct ExtAddrMode which contains all relevant information to make up the + /// address. + virtual Optional + getAddrModeFromMemoryOp(const MachineInstr &MemI, + const TargetRegisterInfo *TRI) const { + return None; + } + + /// Returns true if MI's Def is NullValueReg, and the MI + /// does not change the Zero value. i.e. cases such as rax = shr rax, X where + /// NullValueReg = rax. Note that if the NullValueReg is non-zero, this + /// function can return true even if becomes zero. Specifically cases such as + /// NullValueReg = shl NullValueReg, 63. + virtual bool preservesZeroValueInReg(const MachineInstr *MI, + const Register NullValueReg, + const TargetRegisterInfo *TRI) const { + return false; + } + /// If the instruction is an increment of a constant value, return the amount. virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const { return false; @@ -1304,6 +1376,11 @@ public: virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; + /// Insert noops into the instruction stream at the specified point. + virtual void insertNoops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned Quantity) const; + /// Return the noop instruction to use for a noop. virtual void getNoop(MCInst &NopInst) const; @@ -1355,8 +1432,13 @@ public: /// If the specified instruction defines any predicate /// or condition code register(s) used for predication, returns true as well /// as the definition predicate(s) by reference. 
- virtual bool DefinesPredicate(MachineInstr &MI, - std::vector &Pred) const { + /// SkipDead should be set to false at any point that dead + /// predicate instructions should be considered as being defined. + /// A dead predicate instruction is one that is guaranteed to be removed + /// after a call to PredicateInstruction. + virtual bool ClobbersPredicate(MachineInstr &MI, + std::vector &Pred, + bool SkipDead) const { return false; } @@ -1442,7 +1524,7 @@ public: /// the machine instruction generated due to folding. virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, - unsigned &FoldAsLoadDefReg, + Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const { return nullptr; } @@ -1627,7 +1709,7 @@ public: /// This hook works similarly to getPartialRegUpdateClearance, except that it /// does not take an operand index. Instead sets \p OpNum to the index of the /// unused register. - virtual unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, + virtual unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { // The default implementation returns 0 for no undef register dependency. return 0; @@ -1688,6 +1770,21 @@ public: return 5; } + /// Return the maximal number of alias checks on memory operands. For + /// instructions with more than one memory operands, the alias check on a + /// single MachineInstr pair has quadratic overhead and results in + /// unacceptable performance in the worst case. The limit here is to clamp + /// that maximal checks performed. Usually, that's the product of memory + /// operand numbers from that pair of MachineInstr to be checked. For + /// instance, with two MachineInstrs with 4 and 5 memory operands + /// correspondingly, a total of 20 checks are required. With this limit set to + /// 16, their alias check is skipped. 
We choose to limit the product instead + /// of the individual instruction as targets may have special MachineInstrs + /// with a considerably high number of memory operands, such as `ldm` in ARM. + /// Setting this limit per MachineInstr would result in either too high + /// overhead or too rigid restriction. + virtual unsigned getMemOperandAACheckLimit() const { return 16; } + /// Return an array that contains the ids of the target indices (used for the /// TargetIndex machine operand) and their names. /// diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLowering.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLowering.h index 06f2b3ca38ea..40115fbd2f15 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLowering.h @@ -278,6 +278,7 @@ public: bool IsSRet : 1; bool IsNest : 1; bool IsByVal : 1; + bool IsByRef : 1; bool IsInAlloca : 1; bool IsPreallocated : 1; bool IsReturned : 1; @@ -290,7 +291,7 @@ public: ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), - IsNest(false), IsByVal(false), IsInAlloca(false), + IsNest(false), IsByVal(false), IsByRef(false), IsInAlloca(false), IsPreallocated(false), IsReturned(false), IsSwiftSelf(false), IsSwiftError(false), IsCFGuardTarget(false) {} @@ -374,6 +375,13 @@ public: EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes = true) const; + /// Return the preferred type to use for a shift opcode, given the shifted + /// amount type is \p ShiftValueTy. 
+ LLVM_READONLY + virtual LLT getPreferredShiftAmountTy(LLT ShiftValueTy) const { + return ShiftValueTy; + } + /// Returns the type to be used for the index operand of: /// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, /// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR @@ -419,7 +427,7 @@ public: virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const { // The default action for one element vectors is to scalarize - if (VT.getVectorElementCount() == 1) + if (VT.getVectorElementCount().isScalar()) return TypeScalarizeVector; // The default action for an odd-width vector is to widen. if (!VT.isPow2VectorType()) @@ -597,6 +605,12 @@ public: return false; } + /// Return the maximum number of "x & (x - 1)" operations that can be done + /// instead of deferring to a custom CTPOP. + virtual unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const { + return 1; + } + /// Return true if instruction generated for equality comparison is folded /// with instruction generated for signed comparison. virtual bool isEqualityCmpFoldedWithSignedCmp() const { return true; } @@ -1085,8 +1099,13 @@ public: /// Return true if the specified operation is legal on this target or can be /// made legal with custom lowering. This is used to help guide high-level - /// lowering decisions. - bool isOperationLegalOrCustom(unsigned Op, EVT VT) const { + /// lowering decisions. LegalOnly is an optional convenience for code paths + /// traversed pre and post legalisation. + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + if (LegalOnly) + return isOperationLegal(Op, VT); + return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Custom); @@ -1094,8 +1113,13 @@ public: /// Return true if the specified operation is legal on this target or can be /// made legal using promotion. This is used to help guide high-level lowering - /// decisions. 
- bool isOperationLegalOrPromote(unsigned Op, EVT VT) const { + /// decisions. LegalOnly is an optional convenience for code paths traversed + /// pre and post legalisation. + bool isOperationLegalOrPromote(unsigned Op, EVT VT, + bool LegalOnly = false) const { + if (LegalOnly) + return isOperationLegal(Op, VT); + return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Promote); @@ -1103,8 +1127,13 @@ public: /// Return true if the specified operation is legal on this target or can be /// made legal with custom lowering or using promotion. This is used to help - /// guide high-level lowering decisions. - bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const { + /// guide high-level lowering decisions. LegalOnly is an optional convenience + /// for code paths traversed pre and post legalisation. + bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, + bool LegalOnly = false) const { + if (LegalOnly) + return isOperationLegal(Op, VT); + return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Custom || @@ -1289,6 +1318,10 @@ public: getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } + // Returns true if VT is a legal index type for masked gathers/scatters + // on this target + virtual bool shouldRemoveExtendFromGSIndex(EVT VT) const { return false; } + /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. @@ -1625,6 +1658,11 @@ public: const MachineMemOperand &MMO, bool *Fast = nullptr) const; + /// LLT handling variant. + bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty, + const MachineMemOperand &MMO, + bool *Fast = nullptr) const; + /// Returns the target specific optimal type for load and store operations as /// a result of memset, memcpy, and memmove lowering. 
/// It returns EVT::Other if the type should be determined using generic @@ -1663,13 +1701,9 @@ public: virtual bool isJumpTableRelative() const; - /// Return true if a mulh[s|u] node for a specific type is cheaper than - /// a multiply followed by a shift. This is false by default. - virtual bool isMulhCheaperThanMulShift(EVT Type) const { return false; } - /// If a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. - unsigned getStackPointerRegisterToSaveRestore() const { + Register getStackPointerRegisterToSaveRestore() const { return StackPointerRegisterToSaveRestore; } @@ -1758,17 +1792,10 @@ public: return ""; } - /// Returns true if a cast between SrcAS and DestAS is a noop. - virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return false; - } - /// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we /// are happy to sink it into basic blocks. A cast may be free, but not /// necessarily a no-op. e.g. a free truncate from a 64-bit to 32-bit pointer. - virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isNoopAddrSpaceCast(SrcAS, DestAS); - } + virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const; /// Return true if the pointer arguments to CI should be aligned by aligning /// the object whose address is being passed. If so then MinSize is set to the @@ -2758,6 +2785,10 @@ public: return false; } + /// Does this target require the clearing of high-order bits in a register + /// passed to the fp16 to fp conversion library function. 
+ virtual bool shouldKeepZExtForFP16Conv() const { return false; } + //===--------------------------------------------------------------------===// // Runtime Library hooks // @@ -3090,16 +3121,6 @@ protected: MachineBasicBlock *emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const; - /// Replace/modify the XRay custom event operands with target-dependent - /// details. - MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI, - MachineBasicBlock *MBB) const; - - /// Replace/modify the XRay typed event operands with target-dependent - /// details. - MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI, - MachineBasicBlock *MBB) const; - bool IsStrictFPEnabled; }; @@ -4188,7 +4209,7 @@ public: // Lower custom output constraints. If invalid, return SDValue(). virtual SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, - SDLoc DL, + const SDLoc &DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const; @@ -4255,6 +4276,20 @@ public: return SDValue(); } + /// Return a target-dependent comparison result if the input operand is + /// suitable for use with a square root estimate calculation. For example, the + /// comparison may check if the operand is NAN, INF, zero, normal, etc. The + /// result should be used as the condition operand for a select or branch. + virtual SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const; + + /// Return a target-dependent result if the input operand is not suitable for + /// use with a square root estimate calculation. + virtual SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const { + return DAG.getConstantFP(0.0, SDLoc(Operand), Operand.getValueType()); + } + //===--------------------------------------------------------------------===// // Legalization utility functions // @@ -4269,7 +4304,7 @@ public: /// \param RL Low bits of the RHS of the MUL. See LL for meaning /// \param RH High bits of the RHS of the MUL. See LL for meaning. 
/// \returns true if the node has been expanded, false if it has not - bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS, + bool expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, SDValue LHS, SDValue RHS, SmallVectorImpl &Result, EVT HiLoVT, SelectionDAG &DAG, MulExpansionKind Kind, SDValue LL = SDValue(), SDValue LH = SDValue(), @@ -4297,9 +4332,12 @@ public: /// Expand rotations. /// \param N Node to expand + /// \param AllowVectorOps expand vector rotate, this should only be performed + /// if the legalization is happening outside of LegalizeVectorOps /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandROT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandROT(SDNode *N, bool AllowVectorOps, SDValue &Result, + SelectionDAG &DAG) const; /// Expand float(f32) to SINT(i64) conversion /// \param N Node to expand @@ -4326,6 +4364,11 @@ public: /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; + /// Expand FP_TO_[US]INT_SAT into FP_TO_[US]INT and selects or min/max. + /// \param N Node to expand + /// \returns The expansion result + SDValue expandFP_TO_INT_SAT(SDNode *N, SelectionDAG &DAG) const; + /// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes, /// vector nodes can only succeed if all operations are legal/custom. /// \param N Node to expand @@ -4352,8 +4395,10 @@ public: /// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size)) /// \param N Node to expand /// \param Result output after conversion + /// \param IsNegative indicate negated abs /// \returns True, if the expansion was successful, false otherwise - bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG, + bool IsNegative = false) const; /// Turn load of vector type into a load of the individual elements. 
/// \param LD load to expand @@ -4393,10 +4438,18 @@ public: SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const; + /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This + /// method accepts integers as its arguments. + SDValue expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This /// method accepts integers as its arguments. SDValue expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::[US]SHLSAT. This + /// method accepts integers as its arguments. + SDValue expandShlSat(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::[U|S]MULFIX[SAT]. This /// method accepts integers as its arguments. SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const; @@ -4428,6 +4481,9 @@ public: /// only the first Count elements of the vector are used. SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const; + /// Expand a VECREDUCE_SEQ_* into an explicit ordered calculation. + SDValue expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const; + /// Expand an SREM or UREM using SDIV/UDIV or SDIVREM/UDIVREM, if legal. /// Returns true if the expansion was successful. bool expandREM(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const; @@ -4482,6 +4538,10 @@ public: // combiner can fold the new nodes. SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; + /// Give targets the chance to reduce the number of distinct addresing modes. 
+ ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType, + EVT MemVT, SDValue Offsets) const; + private: SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, DAGCombinerInfo &DCI) const; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 6e2c0973e354..31e08b7d1e63 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -21,6 +21,7 @@ namespace llvm { class GlobalValue; class MachineModuleInfo; +class MachineFunction; class MCContext; class MCExpr; class MCSection; @@ -35,10 +36,9 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { protected: MCSymbolRefExpr::VariantKind PLTRelativeVariantKind = MCSymbolRefExpr::VK_None; - const TargetMachine *TM = nullptr; public: - TargetLoweringObjectFileELF() = default; + TargetLoweringObjectFileELF(); ~TargetLoweringObjectFileELF() override = default; void Initialize(MCContext &Ctx, const TargetMachine &TM) override; @@ -63,6 +63,8 @@ public: MCSection *getSectionForJumpTable(const Function &F, const TargetMachine &TM) const override; + MCSection *getSectionForLSDA(const Function &F, + const TargetMachine &TM) const override; MCSection * getSectionForMachineBasicBlock(const Function &F, @@ -95,6 +97,9 @@ public: const GlobalValue *RHS, const TargetMachine &TM) const override; + const MCExpr *lowerDSOLocalEquivalent(const DSOLocalEquivalent *Equiv, + const TargetMachine &TM) const override; + MCSection *getSectionForCommandLines() const override; }; @@ -143,6 +148,7 @@ public: class TargetLoweringObjectFileCOFF : public TargetLoweringObjectFile { mutable unsigned NextUniqueID = 0; + const TargetMachine *TM = nullptr; public: ~TargetLoweringObjectFileCOFF() override = default; @@ -168,12 +174,6 @@ public: 
MCSection *getStaticDtorSection(unsigned Priority, const MCSymbol *KeySym) const override; - void emitLinkerFlagsForGlobal(raw_ostream &OS, - const GlobalValue *GV) const override; - - void emitLinkerFlagsForUsed(raw_ostream &OS, - const GlobalValue *GV) const override; - const MCExpr *lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const override; @@ -183,6 +183,9 @@ public: MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C, Align &Alignment) const override; + +private: + void emitLinkerDirectives(MCStreamer &Streamer, Module &M) const; }; class TargetLoweringObjectFileWasm : public TargetLoweringObjectFile { @@ -217,6 +220,10 @@ public: TargetLoweringObjectFileXCOFF() = default; ~TargetLoweringObjectFileXCOFF() override = default; + static bool ShouldEmitEHBlock(const MachineFunction *MF); + + static MCSymbol *getEHInfoTableSymbol(const MachineFunction *MF); + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, @@ -246,12 +253,13 @@ public: const Constant *C, Align &Alignment) const override; - static XCOFF::StorageClass getStorageClassForGlobal(const GlobalObject *GO); + static XCOFF::StorageClass getStorageClassForGlobal(const GlobalValue *GV); MCSection * getSectionForFunctionDescriptor(const Function *F, const TargetMachine &TM) const override; - MCSection *getSectionForTOCEntry(const MCSymbol *Sym) const override; + MCSection *getSectionForTOCEntry(const MCSymbol *Sym, + const TargetMachine &TM) const override; /// For external functions, this will always return a function descriptor /// csect. 
@@ -263,7 +271,7 @@ public: MCSymbol *getTargetSymbol(const GlobalValue *GV, const TargetMachine &TM) const override; - MCSymbol *getFunctionEntryPointSymbol(const Function *F, + MCSymbol *getFunctionEntryPointSymbol(const GlobalValue *Func, const TargetMachine &TM) const override; }; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetPassConfig.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetPassConfig.h index a18c8b16bf1c..b4787710379f 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -25,6 +25,7 @@ struct MachineSchedContext; class PassConfigImpl; class ScheduleDAGInstrs; class CSEConfigBase; +class PassInstrumentationCallbacks; // The old pass manager infrastructure is hidden in a legacy namespace now. namespace legacy { @@ -187,7 +188,7 @@ public: /// Insert InsertedPassID pass after TargetPassID pass. void insertPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID, - bool VerifyAfter = true, bool PrintAfter = true); + bool VerifyAfter = true); /// Allow the target to enable a specific standard pass by default. void enablePass(AnalysisID PassID) { substitutePass(PassID, PassID); } @@ -313,14 +314,17 @@ public: /// Add a pass to remove debug info from the MIR. void addStripDebugPass(); + /// Add a pass to check synthesized debug info for MIR. + void addCheckDebugPass(); + /// Add standard passes before a pass that's about to be added. For example, /// the DebugifyMachineModulePass if it is enabled. void addMachinePrePasses(bool AllowDebugify = true); /// Add standard passes after a pass that has just been added. For example, /// the MachineVerifier if it is enabled. 
- void addMachinePostPasses(const std::string &Banner, bool AllowPrint = true, - bool AllowVerify = true, bool AllowStrip = true); + void addMachinePostPasses(const std::string &Banner, bool AllowVerify = true, + bool AllowStrip = true); /// Check whether or not GlobalISel should abort on error. /// When this is disabled, GlobalISel will fall back on SDISel instead of @@ -441,32 +445,30 @@ protected: /// Add a CodeGen pass at this point in the pipeline after checking overrides. /// Return the pass that was added, or zero if no pass was added. - /// @p printAfter if true and adding a machine function pass add an extra - /// machine printer pass afterwards /// @p verifyAfter if true and adding a machine function pass add an extra /// machine verification pass afterwards. - AnalysisID addPass(AnalysisID PassID, bool verifyAfter = true, - bool printAfter = true); + AnalysisID addPass(AnalysisID PassID, bool verifyAfter = true); /// Add a pass to the PassManager if that pass is supposed to be run, as /// determined by the StartAfter and StopAfter options. Takes ownership of the /// pass. - /// @p printAfter if true and adding a machine function pass add an extra - /// machine printer pass afterwards /// @p verifyAfter if true and adding a machine function pass add an extra /// machine verification pass afterwards. - void addPass(Pass *P, bool verifyAfter = true, bool printAfter = true); + void addPass(Pass *P, bool verifyAfter = true); /// addMachinePasses helper to create the target-selected or overriden /// regalloc pass. virtual FunctionPass *createRegAllocPass(bool Optimized); - /// Add core register alloator passes which do the actual register assignment + /// Add core register allocator passes which do the actual register assignment /// and rewriting. \returns true if any passes were added. 
- virtual bool addRegAssignmentFast(); - virtual bool addRegAssignmentOptimized(); + virtual bool addRegAssignAndRewriteFast(); + virtual bool addRegAssignAndRewriteOptimized(); }; +void registerCodeGenCallback(PassInstrumentationCallbacks &PIC, + LLVMTargetMachine &); + } // end namespace llvm #endif // LLVM_CODEGEN_TARGETPASSCONFIG_H diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index d921c4c9028b..8790e2f09eb6 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -34,6 +34,7 @@ namespace llvm { class BitVector; +class DIExpression; class LiveRegMatrix; class MachineFunction; class MachineInstr; @@ -87,22 +88,21 @@ public: /// Return true if the specified register is included in this register class. /// This does not include virtual registers. - bool contains(unsigned Reg) const { + bool contains(Register Reg) const { /// FIXME: Historically this function has returned false when given vregs /// but it should probably only receive physical registers - if (!Register::isPhysicalRegister(Reg)) + if (!Reg.isPhysical()) return false; - return MC->contains(Reg); + return MC->contains(Reg.asMCReg()); } /// Return true if both registers are in this class. - bool contains(unsigned Reg1, unsigned Reg2) const { + bool contains(Register Reg1, Register Reg2) const { /// FIXME: Historically this function has returned false when given a vregs /// but it should probably only receive physical registers - if (!Register::isPhysicalRegister(Reg1) || - !Register::isPhysicalRegister(Reg2)) + if (!Reg1.isPhysical() || !Reg2.isPhysical()) return false; - return MC->contains(Reg1, Reg2); + return MC->contains(Reg1.asMCReg(), Reg2.asMCReg()); } /// Return the cost of copying a value between two registers in this class. 
@@ -386,12 +386,12 @@ public: /// The registers may be virtual registers. bool regsOverlap(Register regA, Register regB) const { if (regA == regB) return true; - if (regA.isVirtual() || regB.isVirtual()) + if (!regA.isPhysical() || !regB.isPhysical()) return false; // Regunits are numerically ordered. Find a common unit. - MCRegUnitIterator RUA(regA, this); - MCRegUnitIterator RUB(regB, this); + MCRegUnitIterator RUA(regA.asMCReg(), this); + MCRegUnitIterator RUB(regB.asMCReg(), this); do { if (*RUA == *RUB) return true; if (*RUA < *RUB) ++RUA; @@ -401,9 +401,9 @@ public: } /// Returns true if Reg contains RegUnit. - bool hasRegUnit(MCRegister Reg, unsigned RegUnit) const { + bool hasRegUnit(MCRegister Reg, Register RegUnit) const { for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) - if (*Units == RegUnit) + if (Register(*Units) == RegUnit) return true; return false; } @@ -415,6 +415,16 @@ public: virtual Register lookThruCopyLike(Register SrcReg, const MachineRegisterInfo *MRI) const; + /// Find the original SrcReg unless it is the target of a copy-like operation, + /// in which case we chain backwards through all such operations to the + /// ultimate source register. If a physical register is encountered, we stop + /// the search. + /// Return the original SrcReg if all the definitions in the chain only have + /// one user and not a physical register. + virtual Register + lookThruSingleUseCopyChain(Register SrcReg, + const MachineRegisterInfo *MRI) const; + /// Return a null-terminated list of all of the callee-saved registers on /// this target. The register should be in the order of desired callee-save /// stack frame offset. The first register is closest to the incoming stack @@ -449,6 +459,13 @@ public: return nullptr; } + /// Return a register mask for the registers preserved by the unwinder, + /// or nullptr if no custom mask is needed. 
+ virtual const uint32_t * + getCustomEHPadPreservedMask(const MachineFunction &MF) const { + return nullptr; + } + /// Return a register mask that clobbers everything. virtual const uint32_t *getNoPreservedMask() const { llvm_unreachable("target does not provide no preserved mask"); @@ -894,11 +911,11 @@ public: return false; } - /// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx - /// before insertion point I. - virtual void materializeFrameBaseRegister(MachineBasicBlock *MBB, - Register BaseReg, int FrameIdx, - int64_t Offset) const { + /// Insert defining instruction(s) for a pointer to FrameIdx before + /// insertion point I. Return materialized frame pointer. + virtual Register materializeFrameBaseRegister(MachineBasicBlock *MBB, + int FrameIdx, + int64_t Offset) const { llvm_unreachable("materializeFrameBaseRegister does not exist on this " "target"); } @@ -917,6 +934,15 @@ public: llvm_unreachable("isFrameOffsetLegal does not exist on this target"); } + /// Gets the DWARF expression opcodes for \p Offset. + virtual void getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl &Ops) const; + + /// Prepends a DWARF expression for \p Offset to DIExpression \p Expr. + DIExpression * + prependOffsetExpression(const DIExpression *Expr, unsigned PrependFlags, + const StackOffset &Offset) const; + /// Spill the register so it can be used by the register scavenger. /// Return true if the register was spilled, false otherwise. /// If this function does not spill the register, the scavenger @@ -970,6 +996,36 @@ public: virtual bool shouldRegionSplitForVirtReg(const MachineFunction &MF, const LiveInterval &VirtReg) const; + /// Last chance recoloring has a high compile time cost especially for + /// targets with a lot of registers. + /// This method is used to decide whether or not \p VirtReg should + /// go through this expensive heuristic. 
+ /// When this target hook is hit, by returning false, there is a high + /// chance that the register allocation will fail altogether (usually with + /// "ran out of registers"). + /// That said, this error usually points to another problem in the + /// optimization pipeline. + virtual bool + shouldUseLastChanceRecoloringForVirtReg(const MachineFunction &MF, + const LiveInterval &VirtReg) const { + return true; + } + + /// Deferred spilling delays the spill insertion of a virtual register + /// after every other allocation. By deferring the spilling, it is + /// sometimes possible to eliminate that spilling altogether because + /// something else could have been eliminated, thus leaving some space + /// for the virtual register. + /// However, this comes with a compile time impact because it adds one + /// more stage to the greedy register allocator. + /// This method is used to decide whether \p VirtReg should use the deferred + /// spilling stage instead of being spilled right away. + virtual bool + shouldUseDeferredSpillingForVirtReg(const MachineFunction &MF, + const LiveInterval &VirtReg) const { + return false; + } + //===--------------------------------------------------------------------===// /// Debug information queries. @@ -994,7 +1050,7 @@ public: /// Returns the physical register number of sub-register "Index" /// for physical register RegNo. Return zero if the sub-register does not /// exist. 
- inline Register getSubReg(MCRegister Reg, unsigned Idx) const { + inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const { return static_cast(this)->getSubReg(Reg, Idx); } }; @@ -1146,8 +1202,8 @@ public: // This is useful when building IndexedMaps keyed on virtual registers struct VirtReg2IndexFunctor { - using argument_type = unsigned; - unsigned operator()(unsigned Reg) const { + using argument_type = Register; + unsigned operator()(Register Reg) const { return Register::virtReg2Index(Reg); } }; diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index e0dfd9c8bbc5..3fac2f688dd8 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -58,8 +58,8 @@ class Triple; /// class TargetSubtargetInfo : public MCSubtargetInfo { protected: // Can only create subclasses... - TargetSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS, - ArrayRef PF, + TargetSubtargetInfo(const Triple &TT, StringRef CPU, StringRef TuneCPU, + StringRef FS, ArrayRef PF, ArrayRef PD, const MCWriteProcResEntry *WPR, const MCWriteLatencyEntry *WL, diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/TileShapeInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/TileShapeInfo.h new file mode 100644 index 000000000000..031d23555b7e --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -0,0 +1,97 @@ +//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Shape utility for AMX. 
+/// AMX hardware requires to config the shape of tile data register before use. +/// The 2D shape includes row and column. In AMX intrinsics interface the shape +/// is passed as 1st and 2nd parameter and they are lowered as the 1st and 2nd +/// machine operand of AMX pseudo instructions. ShapeT class is to facilitate +/// tile config and register allocator. The row and column are machine operand +/// of AMX pseudo instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_TILESHAPEINFO_H +#define LLVM_CODEGEN_TILESHAPEINFO_H + +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" +#include + +namespace llvm { + +class ShapeT { +public: + ShapeT(MachineOperand *Row, MachineOperand *Col, + const MachineRegisterInfo *MRI = nullptr) + : Row(Row), Col(Col) { + if (MRI) + deduceImm(MRI); + } + ShapeT() + : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), + ColImm(InvalidImmShape) {} + bool operator==(const ShapeT &Shape) { + MachineOperand *R = Shape.Row; + MachineOperand *C = Shape.Col; + if (!R || !C) + return false; + if (!Row || !Col) + return false; + if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg()) + return true; + if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape)) + return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm(); + return false; + } + + bool operator!=(const ShapeT &Shape) { return !(*this == Shape); } + + MachineOperand *getRow() const { return Row; } + + MachineOperand *getCol() const { return Col; } + + int64_t getRowImm() const { return RowImm; } + + int64_t getColImm() const { return ColImm; } + + bool isValid() { return (Row != nullptr) && (Col != nullptr); } + + void deduceImm(const MachineRegisterInfo *MRI) { + // All def must be the same value, otherwise it is invalid MIs. 
+ // Find the immediate. + // TODO copy propagation. + auto GetImm = [&](Register Reg) { + int64_t Imm = InvalidImmShape; + for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { + const auto *MI = DefMO.getParent(); + if (MI->isMoveImmediate()) { + Imm = MI->getOperand(1).getImm(); + break; + } + } + return Imm; + }; + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); + } + +private: + static constexpr int64_t InvalidImmShape = -1; + MachineOperand *Row; + MachineOperand *Col; + int64_t RowImm; + int64_t ColImm; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.h index db8161caf7d2..888b83d6f736 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.h @@ -92,26 +92,17 @@ namespace llvm { /// with the element type converted to an integer type with the same /// bitwidth. EVT changeVectorElementTypeToInteger() const { - if (!isSimple()) - return changeExtendedVectorElementTypeToInteger(); - MVT EltTy = getSimpleVT().getVectorElementType(); - unsigned BitWidth = EltTy.getSizeInBits(); - MVT IntTy = MVT::getIntegerVT(BitWidth); - MVT VecTy = MVT::getVectorVT(IntTy, getVectorElementCount()); - assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && - "Simple vector VT not representable by simple integer vector VT!"); - return VecTy; + if (isSimple()) + return getSimpleVT().changeVectorElementTypeToInteger(); + return changeExtendedVectorElementTypeToInteger(); } /// Return a VT for a vector type whose attributes match ourselves /// with the exception of the element type that is chosen by the caller. 
EVT changeVectorElementType(EVT EltVT) const { - if (!isSimple()) - return changeExtendedVectorElementType(EltVT); - MVT VecTy = MVT::getVectorVT(EltVT.V, getVectorElementCount()); - assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE && - "Simple vector VT not representable by simple integer vector VT!"); - return VecTy; + if (isSimple() && EltVT.isSimple()) + return getSimpleVT().changeVectorElementType(EltVT.getSimpleVT()); + return changeExtendedVectorElementType(EltVT); } /// Return the type converted to an equivalently sized integer or vector @@ -122,8 +113,7 @@ namespace llvm { return changeVectorElementTypeToInteger(); if (isSimple()) - return MVT::getIntegerVT(getSizeInBits()); - + return getSimpleVT().changeTypeToInteger(); return changeExtendedTypeToInteger(); } @@ -214,9 +204,7 @@ namespace llvm { } /// Return true if the bit size is a multiple of 8. - bool isByteSized() const { - return getSizeInBits().isByteSized(); - } + bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); } /// Return true if the size is a power-of-two number of bytes. bool isRound() const { @@ -232,28 +220,58 @@ namespace llvm { return getSizeInBits() == VT.getSizeInBits(); } + /// Return true if we know at compile time this has more bits than VT. + bool knownBitsGT(EVT VT) const { + return TypeSize::isKnownGT(getSizeInBits(), VT.getSizeInBits()); + } + + /// Return true if we know at compile time this has more than or the same + /// bits as VT. + bool knownBitsGE(EVT VT) const { + return TypeSize::isKnownGE(getSizeInBits(), VT.getSizeInBits()); + } + + /// Return true if we know at compile time this has fewer bits than VT. + bool knownBitsLT(EVT VT) const { + return TypeSize::isKnownLT(getSizeInBits(), VT.getSizeInBits()); + } + + /// Return true if we know at compile time this has fewer than or the same + /// bits as VT. 
+ bool knownBitsLE(EVT VT) const { + return TypeSize::isKnownLE(getSizeInBits(), VT.getSizeInBits()); + } + /// Return true if this has more bits than VT. bool bitsGT(EVT VT) const { if (EVT::operator==(VT)) return false; - return getSizeInBits() > VT.getSizeInBits(); + assert(isScalableVector() == VT.isScalableVector() && + "Comparison between scalable and fixed types"); + return knownBitsGT(VT); } /// Return true if this has no less bits than VT. bool bitsGE(EVT VT) const { if (EVT::operator==(VT)) return true; - return getSizeInBits() >= VT.getSizeInBits(); + assert(isScalableVector() == VT.isScalableVector() && + "Comparison between scalable and fixed types"); + return knownBitsGE(VT); } /// Return true if this has less bits than VT. bool bitsLT(EVT VT) const { if (EVT::operator==(VT)) return false; - return getSizeInBits() < VT.getSizeInBits(); + assert(isScalableVector() == VT.isScalableVector() && + "Comparison between scalable and fixed types"); + return knownBitsLT(VT); } /// Return true if this has no more bits than VT. bool bitsLE(EVT VT) const { if (EVT::operator==(VT)) return true; - return getSizeInBits() <= VT.getSizeInBits(); + assert(isScalableVector() == VT.isScalableVector() && + "Comparison between scalable and fixed types"); + return knownBitsLE(VT); } /// Return the SimpleValueType held in the specified simple EVT. @@ -285,7 +303,7 @@ namespace llvm { if (isScalableVector()) WithColor::warning() << "Possible incorrect use of EVT::getVectorNumElements() for " - "scalable vector. Scalable flag may be dropped, use" + "scalable vector. Scalable flag may be dropped, use " "EVT::getVectorElementCount() instead\n"; #endif if (isSimple()) @@ -304,7 +322,7 @@ namespace llvm { /// Given a vector type, return the minimum number of elements it contains. unsigned getVectorMinNumElements() const { - return getVectorElementCount().Min; + return getVectorElementCount().getKnownMinValue(); } /// Return the size of the specified value type in bits. 
@@ -318,8 +336,14 @@ namespace llvm { return getExtendedSizeInBits(); } - TypeSize getScalarSizeInBits() const { - return getScalarType().getSizeInBits(); + /// Return the size of the specified fixed width value type in bits. The + /// function will assert if the type is scalable. + uint64_t getFixedSizeInBits() const { + return getSizeInBits().getFixedSize(); + } + + uint64_t getScalarSizeInBits() const { + return getScalarType().getSizeInBits().getFixedSize(); } /// Return the number of bytes overwritten by a store of the specified value @@ -383,8 +407,17 @@ namespace llvm { EVT getHalfNumVectorElementsVT(LLVMContext &Context) const { EVT EltVT = getVectorElementType(); auto EltCnt = getVectorElementCount(); - assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); - return EVT::getVectorVT(Context, EltVT, EltCnt / 2); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); + return EVT::getVectorVT(Context, EltVT, EltCnt.divideCoefficientBy(2)); + } + + // Return a VT for a vector type with the same element type but + // double the number of elements. The type returned may be an + // extended type. + EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const { + EVT EltVT = getVectorElementType(); + auto EltCnt = getVectorElementCount(); + return EVT::getVectorVT(Context, EltVT, EltCnt * 2); } /// Returns true if the given vector is a power of 2. 
@@ -398,7 +431,8 @@ namespace llvm { EVT getPow2VectorType(LLVMContext &Context) const { if (!isPow2VectorType()) { ElementCount NElts = getVectorElementCount(); - NElts.Min = 1 << Log2_32_Ceil(NElts.Min); + unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue()); + NElts = ElementCount::get(NewMinCount, NElts.isScalable()); return EVT::getVectorVT(Context, getVectorElementType(), NElts); } else { diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.td b/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.td index c5eb87cf1d34..d13d0a7772e9 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/ValueTypes.td @@ -87,107 +87,116 @@ def v4i64 : ValueType<256, 60>; // 4 x i64 vector value def v8i64 : ValueType<512, 61>; // 8 x i64 vector value def v16i64 : ValueType<1024,62>; // 16 x i64 vector value def v32i64 : ValueType<2048,63>; // 32 x i64 vector value +def v64i64 : ValueType<4096,64>; // 64 x i64 vector value +def v128i64: ValueType<8192,65>; // 128 x i64 vector value +def v256i64: ValueType<16384,66>; // 256 x i64 vector value + +def v1i128 : ValueType<128, 67>; // 1 x i128 vector value + +def v2f16 : ValueType<32 , 68>; // 2 x f16 vector value +def v3f16 : ValueType<48 , 69>; // 3 x f16 vector value +def v4f16 : ValueType<64 , 70>; // 4 x f16 vector value +def v8f16 : ValueType<128, 71>; // 8 x f16 vector value +def v16f16 : ValueType<256, 72>; // 16 x f16 vector value +def v32f16 : ValueType<512, 73>; // 32 x f16 vector value +def v64f16 : ValueType<1024, 74>; // 64 x f16 vector value +def v128f16 : ValueType<2048, 75>; // 128 x f16 vector value +def v2bf16 : ValueType<32 , 76>; // 2 x bf16 vector value +def v3bf16 : ValueType<48 , 77>; // 3 x bf16 vector value +def v4bf16 : ValueType<64 , 78>; // 4 x bf16 vector value +def v8bf16 : ValueType<128, 79>; // 8 x bf16 vector value +def v16bf16 : ValueType<256, 80>; // 16 x bf16 vector value +def v32bf16 : 
ValueType<512, 81>; // 32 x bf16 vector value +def v64bf16 : ValueType<1024, 82>; // 64 x bf16 vector value +def v128bf16 : ValueType<2048, 83>; // 128 x bf16 vector value +def v1f32 : ValueType<32 , 84>; // 1 x f32 vector value +def v2f32 : ValueType<64 , 85>; // 2 x f32 vector value +def v3f32 : ValueType<96 , 86>; // 3 x f32 vector value +def v4f32 : ValueType<128, 87>; // 4 x f32 vector value +def v5f32 : ValueType<160, 88>; // 5 x f32 vector value +def v8f32 : ValueType<256, 89>; // 8 x f32 vector value +def v16f32 : ValueType<512, 90>; // 16 x f32 vector value +def v32f32 : ValueType<1024, 91>; // 32 x f32 vector value +def v64f32 : ValueType<2048, 92>; // 64 x f32 vector value +def v128f32 : ValueType<4096, 93>; // 128 x f32 vector value +def v256f32 : ValueType<8192, 94>; // 256 x f32 vector value +def v512f32 : ValueType<16384, 95>; // 512 x f32 vector value +def v1024f32 : ValueType<32768, 96>; // 1024 x f32 vector value +def v2048f32 : ValueType<65536, 97>; // 2048 x f32 vector value +def v1f64 : ValueType<64, 98>; // 1 x f64 vector value +def v2f64 : ValueType<128, 99>; // 2 x f64 vector value +def v4f64 : ValueType<256, 100>; // 4 x f64 vector value +def v8f64 : ValueType<512, 101>; // 8 x f64 vector value +def v16f64 : ValueType<1024, 102>; // 16 x f64 vector value +def v32f64 : ValueType<2048, 103>; // 32 x f64 vector value +def v64f64 : ValueType<4096, 104>; // 64 x f64 vector value +def v128f64 : ValueType<8192, 105>; // 128 x f64 vector value +def v256f64 : ValueType<16384, 106>; // 256 x f64 vector value + +def nxv1i1 : ValueType<1, 107>; // n x 1 x i1 vector value +def nxv2i1 : ValueType<2, 108>; // n x 2 x i1 vector value +def nxv4i1 : ValueType<4, 109>; // n x 4 x i1 vector value +def nxv8i1 : ValueType<8, 110>; // n x 8 x i1 vector value +def nxv16i1 : ValueType<16, 111>; // n x 16 x i1 vector value +def nxv32i1 : ValueType<32, 112>; // n x 32 x i1 vector value +def nxv64i1 : ValueType<64,113>; // n x 64 x i1 vector value + +def nxv1i8 :
ValueType<8, 114>; // n x 1 x i8 vector value +def nxv2i8 : ValueType<16, 115>; // n x 2 x i8 vector value +def nxv4i8 : ValueType<32, 116>; // n x 4 x i8 vector value +def nxv8i8 : ValueType<64, 117>; // n x 8 x i8 vector value +def nxv16i8 : ValueType<128, 118>; // n x 16 x i8 vector value +def nxv32i8 : ValueType<256, 119>; // n x 32 x i8 vector value +def nxv64i8 : ValueType<512, 120>; // n x 64 x i8 vector value + +def nxv1i16 : ValueType<16, 121>; // n x 1 x i16 vector value +def nxv2i16 : ValueType<32, 122>; // n x 2 x i16 vector value +def nxv4i16 : ValueType<64, 123>; // n x 4 x i16 vector value +def nxv8i16 : ValueType<128, 124>; // n x 8 x i16 vector value +def nxv16i16: ValueType<256, 125>; // n x 16 x i16 vector value +def nxv32i16: ValueType<512, 126>; // n x 32 x i16 vector value + +def nxv1i32 : ValueType<32, 127>; // n x 1 x i32 vector value +def nxv2i32 : ValueType<64, 128>; // n x 2 x i32 vector value +def nxv4i32 : ValueType<128, 129>; // n x 4 x i32 vector value +def nxv8i32 : ValueType<256, 130>; // n x 8 x i32 vector value +def nxv16i32: ValueType<512, 131>; // n x 16 x i32 vector value +def nxv32i32: ValueType<1024,132>; // n x 32 x i32 vector value + +def nxv1i64 : ValueType<64, 133>; // n x 1 x i64 vector value +def nxv2i64 : ValueType<128, 134>; // n x 2 x i64 vector value +def nxv4i64 : ValueType<256, 135>; // n x 4 x i64 vector value +def nxv8i64 : ValueType<512, 136>; // n x 8 x i64 vector value +def nxv16i64: ValueType<1024,137>; // n x 16 x i64 vector value +def nxv32i64: ValueType<2048,138>; // n x 32 x i64 vector value + +def nxv1f16 : ValueType<16, 139>; // n x 1 x f16 vector value +def nxv2f16 : ValueType<32 , 140>; // n x 2 x f16 vector value +def nxv4f16 : ValueType<64 , 141>; // n x 4 x f16 vector value +def nxv8f16 : ValueType<128, 142>; // n x 8 x f16 vector value +def nxv16f16 : ValueType<256,143>; // n x 16 x f16 vector value +def nxv32f16 : ValueType<512,144>; // n x 32 x f16 vector value +def nxv2bf16 : ValueType<32 ,
145>; // n x 2 x bf16 vector value +def nxv4bf16 : ValueType<64 , 146>; // n x 4 x bf16 vector value +def nxv8bf16 : ValueType<128, 147>; // n x 8 x bf16 vector value +def nxv1f32 : ValueType<32 , 148>; // n x 1 x f32 vector value +def nxv2f32 : ValueType<64 , 149>; // n x 2 x f32 vector value +def nxv4f32 : ValueType<128, 150>; // n x 4 x f32 vector value +def nxv8f32 : ValueType<256, 151>; // n x 8 x f32 vector value +def nxv16f32 : ValueType<512, 152>; // n x 16 x f32 vector value +def nxv1f64 : ValueType<64, 153>; // n x 1 x f64 vector value +def nxv2f64 : ValueType<128, 154>; // n x 2 x f64 vector value +def nxv4f64 : ValueType<256, 155>; // n x 4 x f64 vector value +def nxv8f64 : ValueType<512, 156>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64 , 157>; // X86 MMX value +def FlagVT : ValueType<0 , 158>; // Pre-RA sched glue +def isVoid : ValueType<0 , 159>; // Produces no value +def untyped: ValueType<8 , 160>; // Produces an untyped value +def funcref : ValueType<0 , 161>; // WebAssembly's funcref type +def externref : ValueType<0 , 162>; // WebAssembly's externref type +def x86amx : ValueType<8192, 163>; // X86 AMX value -def v1i128 : ValueType<128, 64>; // 1 x i128 vector value - -def v2f16 : ValueType<32 , 65>; // 2 x f16 vector value -def v3f16 : ValueType<48 , 66>; // 3 x f16 vector value -def v4f16 : ValueType<64 , 67>; // 4 x f16 vector value -def v8f16 : ValueType<128, 68>; // 8 x f16 vector value -def v16f16 : ValueType<256, 69>; // 16 x f16 vector value -def v32f16 : ValueType<512, 70>; // 32 x f16 vector value -def v64f16 : ValueType<1024, 71>; // 64 x f16 vector value -def v128f16 : ValueType<2048, 72>; // 128 x f16 vector value -def v2bf16 : ValueType<32 , 73>; // 2 x bf16 vector value -def v3bf16 : ValueType<48 , 74>; // 3 x bf16 vector value -def v4bf16 : ValueType<64 , 75>; // 4 x bf16 vector value -def v8bf16 : ValueType<128, 76>; // 8 x bf16 vector value -def v16bf16 : ValueType<256, 77>; // 16 x bf16 vector value -def v32bf16 : 
ValueType<512, 78>; // 32 x bf16 vector value -def v64bf16 : ValueType<1024, 79>; // 64 x bf16 vector value -def v128bf16 : ValueType<2048, 80>; // 128 x bf16 vector value -def v1f32 : ValueType<32 , 81>; // 1 x f32 vector value -def v2f32 : ValueType<64 , 82>; // 2 x f32 vector value -def v3f32 : ValueType<96 , 83>; // 3 x f32 vector value -def v4f32 : ValueType<128, 84>; // 4 x f32 vector value -def v5f32 : ValueType<160, 85>; // 5 x f32 vector value -def v8f32 : ValueType<256, 86>; // 8 x f32 vector value -def v16f32 : ValueType<512, 87>; // 16 x f32 vector value -def v32f32 : ValueType<1024, 88>; // 32 x f32 vector value -def v64f32 : ValueType<2048, 89>; // 64 x f32 vector value -def v128f32 : ValueType<4096, 90>; // 128 x f32 vector value -def v256f32 : ValueType<8182, 91>; // 256 x f32 vector value -def v512f32 : ValueType<16384, 92>; // 512 x f32 vector value -def v1024f32 : ValueType<32768, 93>; // 1024 x f32 vector value -def v2048f32 : ValueType<65536, 94>; // 2048 x f32 vector value -def v1f64 : ValueType<64, 95>; // 1 x f64 vector value -def v2f64 : ValueType<128, 96>; // 2 x f64 vector value -def v4f64 : ValueType<256, 97>; // 4 x f64 vector value -def v8f64 : ValueType<512, 98>; // 8 x f64 vector value -def v16f64 : ValueType<1024, 99>; // 16 x f64 vector value -def v32f64 : ValueType<2048, 100>; // 32 x f64 vector value - -def nxv1i1 : ValueType<1, 101>; // n x 1 x i1 vector value -def nxv2i1 : ValueType<2, 102>; // n x 2 x i1 vector value -def nxv4i1 : ValueType<4, 103>; // n x 4 x i1 vector value -def nxv8i1 : ValueType<8, 104>; // n x 8 x i1 vector value -def nxv16i1 : ValueType<16, 105>; // n x 16 x i1 vector value -def nxv32i1 : ValueType<32, 106>; // n x 32 x i1 vector value -def nxv64i1 : ValueType<64,107>; // n x 64 x i1 vector value - -def nxv1i8 : ValueType<8, 108>; // n x 1 x i8 vector value -def nxv2i8 : ValueType<16, 109>; // n x 2 x i8 vector value -def nxv4i8 : ValueType<32, 110>; // n x 4 x i8 vector value -def nxv8i8 : ValueType<64, 
111>; // n x 8 x i8 vector value -def nxv16i8 : ValueType<128, 112>; // n x 16 x i8 vector value -def nxv32i8 : ValueType<256, 113>; // n x 32 x i8 vector value -def nxv64i8 : ValueType<512, 114>; // n x 64 x i8 vector value - -def nxv1i16 : ValueType<16, 115>; // n x 1 x i16 vector value -def nxv2i16 : ValueType<32, 116>; // n x 2 x i16 vector value -def nxv4i16 : ValueType<64, 117>; // n x 4 x i16 vector value -def nxv8i16 : ValueType<128, 118>; // n x 8 x i16 vector value -def nxv16i16: ValueType<256, 119>; // n x 16 x i16 vector value -def nxv32i16: ValueType<512, 120>; // n x 32 x i16 vector value - -def nxv1i32 : ValueType<32, 121>; // n x 1 x i32 vector value -def nxv2i32 : ValueType<64, 122>; // n x 2 x i32 vector value -def nxv4i32 : ValueType<128, 123>; // n x 4 x i32 vector value -def nxv8i32 : ValueType<256, 124>; // n x 8 x i32 vector value -def nxv16i32: ValueType<512, 125>; // n x 16 x i32 vector value -def nxv32i32: ValueType<1024,126>; // n x 32 x i32 vector value - -def nxv1i64 : ValueType<64, 127>; // n x 1 x i64 vector value -def nxv2i64 : ValueType<128, 128>; // n x 2 x i64 vector value -def nxv4i64 : ValueType<256, 129>; // n x 4 x i64 vector value -def nxv8i64 : ValueType<512, 130>; // n x 8 x i64 vector value -def nxv16i64: ValueType<1024,131>; // n x 16 x i64 vector value -def nxv32i64: ValueType<2048,132>; // n x 32 x i64 vector value - -def nxv1f16 : ValueType<32, 133>; // n x 1 x f16 vector value -def nxv2f16 : ValueType<32 , 134>; // n x 2 x f16 vector value -def nxv4f16 : ValueType<64 , 135>; // n x 4 x f16 vector value -def nxv8f16 : ValueType<128, 136>; // n x 8 x f16 vector value -def nxv16f16 : ValueType<256,137>; // n x 16 x f16 vector value -def nxv32f16 : ValueType<512,138>; // n x 32 x f16 vector value -def nxv2bf16 : ValueType<32 , 139>; // n x 2 x bf16 vector value -def nxv4bf16 : ValueType<64 , 140>; // n x 4 x bf16 vector value -def nxv8bf16 : ValueType<128, 141>; // n x 8 x bf16 vector value -def nxv1f32 : ValueType<32 , 
142>; // n x 1 x f32 vector value -def nxv2f32 : ValueType<64 , 143>; // n x 2 x f32 vector value -def nxv4f32 : ValueType<128, 144>; // n x 4 x f32 vector value -def nxv8f32 : ValueType<256, 145>; // n x 8 x f32 vector value -def nxv16f32 : ValueType<512, 146>; // n x 16 x f32 vector value -def nxv1f64 : ValueType<64, 147>; // n x 1 x f64 vector value -def nxv2f64 : ValueType<128, 148>; // n x 2 x f64 vector value -def nxv4f64 : ValueType<256, 149>; // n x 4 x f64 vector value -def nxv8f64 : ValueType<512, 150>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64 , 151>; // X86 MMX value -def FlagVT : ValueType<0 , 152>; // Pre-RA sched glue -def isVoid : ValueType<0 , 153>; // Produces no value -def untyped: ValueType<8 , 154>; // Produces an untyped value -def exnref : ValueType<0 , 155>; // WebAssembly's exnref type def token : ValueType<0 , 248>; // TokenTy def MetadataVT: ValueType<0, 249>; // Metadata diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/VirtRegMap.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/VirtRegMap.h index 823154318eb7..deef4b90279a 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/VirtRegMap.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/VirtRegMap.h @@ -19,6 +19,7 @@ #include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TileShapeInfo.h" #include "llvm/Pass.h" #include @@ -60,6 +61,10 @@ class TargetInstrInfo; /// mapping. IndexedMap Virt2SplitMap; + /// Virt2ShapeMap - For X86 AMX register whose register is bound shape + /// information. + DenseMap Virt2ShapeMap; + /// createSpillSlot - Allocate a spill slot for RC from MFI. 
unsigned createSpillSlot(const TargetRegisterClass *RC); @@ -98,15 +103,30 @@ class TargetInstrInfo; /// returns the physical register mapped to the specified /// virtual register - Register getPhys(Register virtReg) const { + MCRegister getPhys(Register virtReg) const { assert(virtReg.isVirtual()); - return Virt2PhysMap[virtReg.id()]; + return MCRegister::from(Virt2PhysMap[virtReg.id()]); } /// creates a mapping for the specified virtual register to /// the specified physical register void assignVirt2Phys(Register virtReg, MCPhysReg physReg); + bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); } + + bool hasShape(Register virtReg) const { + return getShape(virtReg).isValid(); + } + + ShapeT getShape(Register virtReg) const { + assert(virtReg.isVirtual()); + return Virt2ShapeMap.lookup(virtReg); + } + + void assignVirt2Shape(Register virtReg, ShapeT shape) { + Virt2ShapeMap[virtReg.id()] = shape; + } + /// clears the specified virtual register's, physical /// register mapping void clearVirt(Register virtReg) { @@ -131,12 +151,15 @@ class TargetInstrInfo; bool hasKnownPreference(Register VirtReg); /// records virtReg is a split live interval from SReg. - void setIsSplitFromReg(Register virtReg, unsigned SReg) { + void setIsSplitFromReg(Register virtReg, Register SReg) { Virt2SplitMap[virtReg.id()] = SReg; + if (hasShape(SReg)) { + Virt2ShapeMap[virtReg.id()] = getShape(SReg); + } } /// returns the live interval virtReg is split from. - unsigned getPreSplitReg(Register virtReg) const { + Register getPreSplitReg(Register virtReg) const { return Virt2SplitMap[virtReg.id()]; } @@ -144,8 +167,8 @@ class TargetInstrInfo; /// from through splitting. /// A register that was not created by splitting is its own original. /// This operation is idempotent. - unsigned getOriginal(unsigned VirtReg) const { - unsigned Orig = getPreSplitReg(VirtReg); + Register getOriginal(Register VirtReg) const { + Register Orig = getPreSplitReg(VirtReg); return Orig ? 
Orig : VirtReg; } diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h index 41f8856f31f2..54e8c40a9e72 100644 --- a/contrib/llvm-project/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h +++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/WasmEHFuncInfo.h @@ -22,7 +22,9 @@ class BasicBlock; class Function; class MachineBasicBlock; +namespace WebAssembly { enum EventTag { CPP_EXCEPTION = 0, C_LONGJMP = 1 }; +} using BBOrMBB = PointerUnion; diff --git a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinker.h index be3c5ebcadae..7281966fc608 100644 --- a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinker.h +++ b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinker.h @@ -64,14 +64,17 @@ public: /// section. Reset current relocation pointer if neccessary. virtual bool hasValidRelocs(bool ResetRelocsPtr = true) = 0; - /// Checks that there is a relocation against .debug_info - /// table between \p StartOffset and \p NextOffset. - /// - /// This function must be called with offsets in strictly ascending - /// order because it never looks back at relocations it already 'went past'. - /// \returns true and sets Info.InDebugMap if it is the case. - virtual bool hasValidRelocationAt(uint64_t StartOffset, uint64_t EndOffset, - CompileUnit::DIEInfo &Info) = 0; + /// Checks that the specified DIE has a DW_AT_Location attribute + /// that references into a live code section. This function + /// must be called with DIE offsets in strictly ascending order. + virtual bool hasLiveMemoryLocation(const DWARFDie &DIE, + CompileUnit::DIEInfo &Info) = 0; + + /// Checks that the specified DIE has a DW_AT_Low_pc attribute + /// that references into a live code section. This function + /// must be called with DIE offsets in strictly ascending order. 
+ virtual bool hasLiveAddressRange(const DWARFDie &DIE, + CompileUnit::DIEInfo &Info) = 0; /// Apply the valid relocations to the buffer \p Data, taking into /// account that Data is at \p BaseOffset in the debug_info section. @@ -82,6 +85,9 @@ public: virtual bool applyValidRelocs(MutableArrayRef Data, uint64_t BaseOffset, bool IsLittleEndian) = 0; + /// Relocate the given address offset if a valid relocation exists. + virtual llvm::Expected relocateIndexedAddr(uint64_t Offset) = 0; + /// Returns all valid functions address ranges(i.e., those ranges /// which points to sections with code). virtual RangesTy &getValidAddressRanges() = 0; @@ -180,7 +186,8 @@ public: /// /// As a side effect, this also switches the current Dwarf version /// of the MC layer to the one of U.getOrigUnit(). - virtual void emitCompileUnitHeader(CompileUnit &Unit) = 0; + virtual void emitCompileUnitHeader(CompileUnit &Unit, + unsigned DwarfVersion) = 0; /// Recursively emit the DIE tree rooted at \p Die. virtual void emitDIE(DIE &Die) = 0; @@ -202,9 +209,9 @@ using UnitListTy = std::vector>; /// this class represents DWARF information for source file /// and it`s address map. -class DwarfFile { +class DWARFFile { public: - DwarfFile(StringRef Name, DWARFContext *Dwarf, AddressesMap *Addresses, + DWARFFile(StringRef Name, DWARFContext *Dwarf, AddressesMap *Addresses, const std::vector &Warnings) : FileName(Name), Dwarf(Dwarf), Addresses(Addresses), Warnings(Warnings) { } @@ -222,7 +229,7 @@ public: typedef std::function messageHandler; -typedef std::function(StringRef ContainerName, +typedef std::function(StringRef ContainerName, StringRef Path)> objFileLoader; typedef std::map swiftInterfacesMap; @@ -249,7 +256,7 @@ public: : TheDwarfEmitter(Emitter), DwarfLinkerClientID(ClientID) {} /// Add object file to be linked. - void addObjectFile(DwarfFile &File); + void addObjectFile(DWARFFile &File); /// Link debug info for added objFiles. Object /// files are linked all together. 
@@ -353,36 +360,38 @@ private: /// of work needs to be performed when processing the current item. The flags /// and info fields are optional based on the type. struct WorklistItem { - WorklistItemType Type; DWARFDie Die; + WorklistItemType Type; CompileUnit &CU; unsigned Flags; - unsigned AncestorIdx = 0; - CompileUnit::DIEInfo *OtherInfo = nullptr; + union { + const unsigned AncestorIdx; + CompileUnit::DIEInfo *OtherInfo; + }; WorklistItem(DWARFDie Die, CompileUnit &CU, unsigned Flags, WorklistItemType T = WorklistItemType::LookForDIEsToKeep) - : Type(T), Die(Die), CU(CU), Flags(Flags) {} + : Die(Die), Type(T), CU(CU), Flags(Flags), AncestorIdx(0) {} WorklistItem(DWARFDie Die, CompileUnit &CU, WorklistItemType T, CompileUnit::DIEInfo *OtherInfo = nullptr) - : Type(T), Die(Die), CU(CU), OtherInfo(OtherInfo) {} + : Die(Die), Type(T), CU(CU), Flags(0), OtherInfo(OtherInfo) {} WorklistItem(unsigned AncestorIdx, CompileUnit &CU, unsigned Flags) - : Type(WorklistItemType::LookForParentDIEsToKeep), CU(CU), Flags(Flags), - AncestorIdx(AncestorIdx) {} + : Die(), Type(WorklistItemType::LookForParentDIEsToKeep), CU(CU), + Flags(Flags), AncestorIdx(AncestorIdx) {} }; /// returns true if we need to translate strings. bool needToTranslateStrings() { return StringsTranslator != nullptr; } - void reportWarning(const Twine &Warning, const DwarfFile &File, + void reportWarning(const Twine &Warning, const DWARFFile &File, const DWARFDie *DIE = nullptr) const { if (Options.WarningHandler != nullptr) Options.WarningHandler(Warning, File.FileName, DIE); } - void reportError(const Twine &Warning, const DwarfFile &File, + void reportError(const Twine &Warning, const DWARFFile &File, const DWARFDie *DIE = nullptr) const { if (Options.ErrorHandler != nullptr) Options.ErrorHandler(Warning, File.FileName, DIE); @@ -398,18 +407,18 @@ private: void updateAccelKind(DWARFContext &Dwarf); /// Emit warnings as Dwarf compile units to leave a trail after linking. 
- bool emitPaperTrailWarnings(const DwarfFile &File, + bool emitPaperTrailWarnings(const DWARFFile &File, OffsetsStringPool &StringPool); void copyInvariantDebugSection(DWARFContext &Dwarf); /// Keeps track of data associated with one object during linking. struct LinkContext { - DwarfFile &File; + DWARFFile &File; UnitListTy CompileUnits; bool Skip = false; - LinkContext(DwarfFile &File) : File(File) {} + LinkContext(DWARFFile &File) : File(File) {} /// Clear part of the context that's no longer needed when we're done with /// the debug object. @@ -438,7 +447,7 @@ private: /// kept. All DIEs referenced though attributes should be kept. void lookForRefDIEsToKeep(const DWARFDie &Die, CompileUnit &CU, unsigned Flags, const UnitListTy &Units, - const DwarfFile &File, + const DWARFFile &File, SmallVectorImpl &Worklist); /// \defgroup FindRootDIEs Find DIEs corresponding to Address map entries. @@ -450,7 +459,7 @@ private: /// The return value indicates whether the DIE is incomplete. void lookForDIEsToKeep(AddressesMap &RelocMgr, RangesTy &Ranges, const UnitListTy &Units, const DWARFDie &DIE, - const DwarfFile &File, CompileUnit &CU, + const DWARFFile &File, CompileUnit &CU, unsigned Flags); /// If this compile unit is really a skeleton CU that points to a @@ -460,9 +469,8 @@ private: /// pointing to the module, and a DW_AT_gnu_dwo_id with the module /// hash. bool registerModuleReference(DWARFDie CUDie, const DWARFUnit &Unit, - const DwarfFile &File, + const DWARFFile &File, OffsetsStringPool &OffsetsStringPool, - UniquingStringPool &UniquingStringPoolStringPool, DeclContextTree &ODRContexts, uint64_t ModulesEndOffset, unsigned &UnitID, bool IsLittleEndian, unsigned Indent = 0, @@ -473,9 +481,8 @@ private: /// to Units. 
Error loadClangModule(DWARFDie CUDie, StringRef FilePath, StringRef ModuleName, uint64_t DwoId, - const DwarfFile &File, + const DWARFFile &File, OffsetsStringPool &OffsetsStringPool, - UniquingStringPool &UniquingStringPool, DeclContextTree &ODRContexts, uint64_t ModulesEndOffset, unsigned &UnitID, bool IsLittleEndian, unsigned Indent = 0, bool Quiet = false); @@ -484,22 +491,21 @@ private: void keepDIEAndDependencies(AddressesMap &RelocMgr, RangesTy &Ranges, const UnitListTy &Units, const DWARFDie &DIE, CompileUnit::DIEInfo &MyInfo, - const DwarfFile &File, CompileUnit &CU, + const DWARFFile &File, CompileUnit &CU, bool UseODR); unsigned shouldKeepDIE(AddressesMap &RelocMgr, RangesTy &Ranges, - const DWARFDie &DIE, const DwarfFile &File, + const DWARFDie &DIE, const DWARFFile &File, CompileUnit &Unit, CompileUnit::DIEInfo &MyInfo, unsigned Flags); /// Check if a variable describing DIE should be kept. /// \returns updated TraversalFlags. unsigned shouldKeepVariableDIE(AddressesMap &RelocMgr, const DWARFDie &DIE, - CompileUnit &Unit, CompileUnit::DIEInfo &MyInfo, unsigned Flags); unsigned shouldKeepSubprogramDIE(AddressesMap &RelocMgr, RangesTy &Ranges, - const DWARFDie &DIE, const DwarfFile &File, + const DWARFDie &DIE, const DWARFFile &File, CompileUnit &Unit, CompileUnit::DIEInfo &MyInfo, unsigned Flags); @@ -508,7 +514,7 @@ private: /// RefValue. The resulting DIE might be in another CompileUnit which is /// stored into \p ReferencedCU. \returns null if resolving fails for any /// reason. - DWARFDie resolveDIEReference(const DwarfFile &File, const UnitListTy &Units, + DWARFDie resolveDIEReference(const DWARFFile &File, const UnitListTy &Units, const DWARFFormValue &RefValue, const DWARFDie &DIE, CompileUnit *&RefCU); @@ -523,7 +529,7 @@ private: class DIECloner { DWARFLinker &Linker; DwarfEmitter *Emitter; - DwarfFile &ObjFile; + DWARFFile &ObjFile; /// Allocator used for all the DIEValue objects. 
BumpPtrAllocator &DIEAlloc; @@ -533,7 +539,7 @@ private: bool Update; public: - DIECloner(DWARFLinker &Linker, DwarfEmitter *Emitter, DwarfFile &ObjFile, + DIECloner(DWARFLinker &Linker, DwarfEmitter *Emitter, DWARFFile &ObjFile, BumpPtrAllocator &DIEAlloc, std::vector> &CompileUnits, bool Update) @@ -551,7 +557,7 @@ private: /// applied to the entry point of the function to get the linked address. /// \param Die the output DIE to use, pass NULL to create one. /// \returns the root of the cloned tree or null if nothing was selected. - DIE *cloneDIE(const DWARFDie &InputDIE, const DwarfFile &File, + DIE *cloneDIE(const DWARFDie &InputDIE, const DWARFFile &File, CompileUnit &U, OffsetsStringPool &StringPool, int64_t PCOffset, uint32_t OutOffset, unsigned Flags, bool IsLittleEndian, DIE *Die = nullptr); @@ -560,7 +566,7 @@ private: /// chose to keep above. If there are no valid relocs, then there's /// nothing to clone/emit. uint64_t cloneAllCompileUnits(DWARFContext &DwarfContext, - const DwarfFile &File, + const DWARFFile &File, OffsetsStringPool &StringPool, bool IsLittleEndian); @@ -606,7 +612,7 @@ private: /// Helper for cloneDIE. unsigned cloneAttribute(DIE &Die, const DWARFDie &InputDIE, - const DwarfFile &File, CompileUnit &U, + const DWARFFile &File, CompileUnit &U, OffsetsStringPool &StringPool, const DWARFFormValue &Val, const AttributeSpec AttrSpec, unsigned AttrSize, @@ -627,18 +633,18 @@ private: AttributeSpec AttrSpec, unsigned AttrSize, const DWARFFormValue &Val, - const DwarfFile &File, + const DWARFFile &File, CompileUnit &Unit); /// Clone a DWARF expression that may be referencing another DIE. void cloneExpression(DataExtractor &Data, DWARFExpression Expression, - const DwarfFile &File, CompileUnit &Unit, + const DWARFFile &File, CompileUnit &Unit, SmallVectorImpl &OutputBuffer); /// Clone an attribute referencing another DIE and add /// it to \p Die. /// \returns the size of the new attribute. 
- unsigned cloneBlockAttribute(DIE &Die, const DwarfFile &File, + unsigned cloneBlockAttribute(DIE &Die, const DWARFFile &File, CompileUnit &Unit, AttributeSpec AttrSpec, const DWARFFormValue &Val, unsigned AttrSize, bool IsLittleEndian); @@ -654,7 +660,7 @@ private: /// Clone a scalar attribute and add it to \p Die. /// \returns the size of the new attribute. unsigned cloneScalarAttribute(DIE &Die, const DWARFDie &InputDIE, - const DwarfFile &File, CompileUnit &U, + const DWARFFile &File, CompileUnit &U, AttributeSpec AttrSpec, const DWARFFormValue &Val, unsigned AttrSize, AttributesInfo &Info); @@ -670,7 +676,7 @@ private: void copyAbbrev(const DWARFAbbreviationDeclaration &Abbrev, bool hasODR); uint32_t hashFullyQualifiedName(DWARFDie DIE, CompileUnit &U, - const DwarfFile &File, + const DWARFFile &File, int RecurseDepth = 0); /// Helper for cloneDIE. @@ -685,7 +691,7 @@ private: /// Compute and emit debug_ranges section for \p Unit, and /// patch the attributes referencing it. void patchRangesForUnit(const CompileUnit &Unit, DWARFContext &Dwarf, - const DwarfFile &File) const; + const DWARFFile &File) const; /// Generate and emit the DW_AT_ranges attribute for a compile_unit if it had /// one. @@ -695,7 +701,7 @@ private: /// parts according to the linked function ranges and emit the result in the /// debug_line section. void patchLineTableForUnit(CompileUnit &Unit, DWARFContext &OrigDwarf, - const DwarfFile &File); + const DWARFFile &File); /// Emit the accelerator entries for \p Unit. void emitAcceleratorEntriesForUnit(CompileUnit &Unit); @@ -703,7 +709,7 @@ private: void emitAppleAcceleratorEntriesForUnit(CompileUnit &Unit); /// Patch the frame info for an object file and emit it. - void patchFrameInfoForObject(const DwarfFile &, RangesTy &Ranges, + void patchFrameInfoForObject(const DWARFFile &, RangesTy &Ranges, DWARFContext &, unsigned AddressSize); /// FoldingSet that uniques the abbreviations. 
diff --git a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h index 944e7e3501c9..a6310bcb5df1 100644 --- a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h +++ b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h @@ -101,10 +101,7 @@ public: unsigned getUniqueID() const { return ID; } - void createOutputDIE() { - NewUnit.emplace(OrigUnit.getVersion(), OrigUnit.getAddressByteSize(), - OrigUnit.getUnitDIE().getTag()); - } + void createOutputDIE() { NewUnit.emplace(OrigUnit.getUnitDIE().getTag()); } DIE *getOutputUnitDIE() const { if (NewUnit) @@ -123,6 +120,11 @@ public: DIEInfo &getInfo(unsigned Idx) { return Info[Idx]; } const DIEInfo &getInfo(unsigned Idx) const { return Info[Idx]; } + DIEInfo &getInfo(const DWARFDie &Die) { + unsigned Idx = getOrigUnit().getDIEIndex(Die); + return Info[Idx]; + } + uint64_t getStartOffset() const { return StartOffset; } uint64_t getNextUnitOffset() const { return NextUnitOffset; } void setStartOffset(uint64_t DebugInfoSize) { StartOffset = DebugInfoSize; } @@ -157,7 +159,7 @@ public: /// Compute the end offset for this unit. Must be called after the CU's DIEs /// have been cloned. \returns the next unit offset (which is also the /// current debug_info section size). - uint64_t computeNextUnitOffset(); + uint64_t computeNextUnitOffset(uint16_t DwarfVersion); /// Keep track of a forward reference to DIE \p Die in \p RefUnit by \p /// Attr. 
The attribute should be fixed up later to point to the absolute @@ -235,21 +237,6 @@ public: const std::vector &getNamespaces() const { return Namespaces; } const std::vector &getObjC() const { return ObjC; } - /// Get the full path for file \a FileNum in the line table - StringRef getResolvedPath(unsigned FileNum) { - if (FileNum >= ResolvedPaths.size()) - return StringRef(); - return ResolvedPaths[FileNum]; - } - - /// Set the fully resolved path for the line-table's file \a FileNum - /// to \a Path. - void setResolvedPath(unsigned FileNum, StringRef Path) { - if (ResolvedPaths.size() <= FileNum) - ResolvedPaths.resize(FileNum + 1); - ResolvedPaths[FileNum] = Path; - } - MCSymbol *getLabelBegin() { return LabelBegin; } void setLabelBegin(MCSymbol *S) { LabelBegin = S; } @@ -308,12 +295,6 @@ private: std::vector ObjC; /// @} - /// Cached resolved paths from the line table. - /// Note, the StringRefs here point in to the intern (uniquing) string pool. - /// This means that a StringRef returned here doesn't need to then be uniqued - /// for the purposes of getting a unique address for each string. - std::vector ResolvedPaths; - /// Is this unit subject to the ODR rule? 
bool HasODR; diff --git a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h index e59e15f00a7e..d2274488e85f 100644 --- a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h +++ b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFLinkerDeclContext.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/NonRelocatableStringpool.h" #include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -31,16 +32,18 @@ class CachedPathResolver { public: /// Resolve a path by calling realpath and cache its result. The returned /// StringRef is interned in the given \p StringPool. - StringRef resolve(std::string Path, NonRelocatableStringpool &StringPool) { + StringRef resolve(const std::string &Path, + NonRelocatableStringpool &StringPool) { StringRef FileName = sys::path::filename(Path); - SmallString<256> ParentPath = sys::path::parent_path(Path); + StringRef ParentPath = sys::path::parent_path(Path); // If the ParentPath has not yet been resolved, resolve and cache it for // future look-ups. if (!ResolvedPaths.count(ParentPath)) { SmallString<256> RealPath; sys::fs::real_path(ParentPath, RealPath); - ResolvedPaths.insert({ParentPath, StringRef(RealPath).str()}); + ResolvedPaths.insert( + {ParentPath, std::string(RealPath.c_str(), RealPath.size())}); } // Join the file name again with the resolved path. @@ -95,7 +98,6 @@ public: void setDefinedInClangModule(bool Val) { DefinedInClangModule = Val; } uint16_t getTag() const { return Tag; } - StringRef getName() const { return Name; } private: friend DeclMapInfo; @@ -129,10 +131,10 @@ public: /// /// FIXME: The invalid bit along the return value is to emulate some /// dsymutil-classic functionality. 
- PointerIntPair - getChildDeclContext(DeclContext &Context, const DWARFDie &DIE, - CompileUnit &Unit, UniquingStringPool &StringPool, - bool InClangModule); + PointerIntPair getChildDeclContext(DeclContext &Context, + const DWARFDie &DIE, + CompileUnit &Unit, + bool InClangModule); DeclContext &getRoot() { return Root; } @@ -141,8 +143,19 @@ private: DeclContext Root; DeclContext::Map Contexts; - /// Cache resolved paths from the line table. + /// Cached resolved paths from the line table. + /// The key is . + using ResolvedPathsMap = DenseMap, StringRef>; + ResolvedPathsMap ResolvedPaths; + + /// Helper that resolves and caches fragments of file paths. CachedPathResolver PathResolver; + + /// String pool keeping real path bodies. + NonRelocatableStringpool StringPool; + + StringRef getResolvedPath(CompileUnit &CU, unsigned FileNum, + const DWARFDebugLine::LineTable &LineTable); }; /// Info type for the DenseMap storing the DeclContext pointers. diff --git a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFStreamer.h b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFStreamer.h index de58f5dedf24..7b0851159252 100644 --- a/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFStreamer.h +++ b/contrib/llvm-project/llvm/include/llvm/DWARFLinker/DWARFStreamer.h @@ -64,7 +64,7 @@ public: /// /// As a side effect, this also switches the current Dwarf version /// of the MC layer to the one of U.getOrigUnit(). - void emitCompileUnitHeader(CompileUnit &Unit) override; + void emitCompileUnitHeader(CompileUnit &Unit, unsigned DwarfVersion) override; /// Recursively emit the DIE tree rooted at \p Die. 
void emitDIE(DIE &Die) override; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h index 784c47e3bf5d..bb29ef5f2ce8 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h @@ -11,9 +11,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" @@ -61,12 +61,9 @@ public: ArrayRef RecordData; }; -template struct RemappedRecord { - explicit RemappedRecord(const CVRecord &R) : OriginalRecord(R) {} - - CVRecord OriginalRecord; - SmallVector, 8> Mappings; -}; +// There are two kinds of codeview records: type and symbol records. 
+using CVType = CVRecord; +using CVSymbol = CVRecord; template Error forEachCodeViewRecord(ArrayRef StreamBuffer, Func F) { @@ -126,6 +123,12 @@ struct VarStreamArrayExtractor> { } }; +namespace codeview { +using CVSymbolArray = VarStreamArray; +using CVTypeArray = VarStreamArray; +using CVTypeRange = iterator_range; +} // namespace codeview + } // end namespace llvm #endif // LLVM_DEBUGINFO_CODEVIEW_RECORDITERATOR_H diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h index 1615ff41df12..82ef8c173bee 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CVSymbolVisitor.h @@ -10,9 +10,6 @@ #define LLVM_DEBUGINFO_CODEVIEW_CVSYMBOLVISITOR_H #include "llvm/DebugInfo/CodeView/CVRecord.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h" #include "llvm/Support/ErrorOr.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index f26e80ebe2a9..d851dea0a27f 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -15,7 +15,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/GUID.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Error.h" diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def 
b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def index ed5c143818e6..48ea7e52c172 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def @@ -15,6 +15,7 @@ #endif #if !defined(CV_REGISTERS_ALL) && !defined(CV_REGISTERS_X86) && \ + !defined(CV_REGISTERS_ARM) && \ !defined(CV_REGISTERS_ARM64) #error Need include at least one register set. #endif @@ -393,13 +394,46 @@ CV_REGISTER(ARM_PC, 25) // Status register -CV_REGISTER(ARM_CPSR, 25) +CV_REGISTER(ARM_CPSR, 26) // ARM VFPv1 registers CV_REGISTER(ARM_FPSCR, 40) CV_REGISTER(ARM_FPEXC, 41) +CV_REGISTER(ARM_FS0, 50) +CV_REGISTER(ARM_FS1, 51) +CV_REGISTER(ARM_FS2, 52) +CV_REGISTER(ARM_FS3, 53) +CV_REGISTER(ARM_FS4, 54) +CV_REGISTER(ARM_FS5, 55) +CV_REGISTER(ARM_FS6, 56) +CV_REGISTER(ARM_FS7, 57) +CV_REGISTER(ARM_FS8, 58) +CV_REGISTER(ARM_FS9, 59) +CV_REGISTER(ARM_FS10, 60) +CV_REGISTER(ARM_FS11, 61) +CV_REGISTER(ARM_FS12, 62) +CV_REGISTER(ARM_FS13, 63) +CV_REGISTER(ARM_FS14, 64) +CV_REGISTER(ARM_FS15, 65) +CV_REGISTER(ARM_FS16, 66) +CV_REGISTER(ARM_FS17, 67) +CV_REGISTER(ARM_FS18, 68) +CV_REGISTER(ARM_FS19, 69) +CV_REGISTER(ARM_FS20, 70) +CV_REGISTER(ARM_FS21, 71) +CV_REGISTER(ARM_FS22, 72) +CV_REGISTER(ARM_FS23, 73) +CV_REGISTER(ARM_FS24, 74) +CV_REGISTER(ARM_FS25, 75) +CV_REGISTER(ARM_FS26, 76) +CV_REGISTER(ARM_FS27, 77) +CV_REGISTER(ARM_FS28, 78) +CV_REGISTER(ARM_FS29, 79) +CV_REGISTER(ARM_FS30, 80) +CV_REGISTER(ARM_FS31, 81) + // ARM VFPv3/NEON registers CV_REGISTER(ARM_FS32, 200) diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h index 720b1b49581f..624a623e75b8 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h +++ 
b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h @@ -10,10 +10,8 @@ #define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENTVISITOR_H #include "llvm/DebugInfo/CodeView/CodeView.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" #include "llvm/Support/Error.h" -#include namespace llvm { @@ -30,7 +28,6 @@ class DebugStringTableSubsectionRef; class DebugSymbolRVASubsectionRef; class DebugSymbolsSubsectionRef; class DebugUnknownSubsectionRef; -class StringsAndChecksumsRef; class DebugSubsectionVisitor { public: diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h index 784fc59484b9..51b8523ed969 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h @@ -9,8 +9,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H #define LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h index 35eeef5a327e..ddbb4e3c5e6c 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h @@ -14,7 +14,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/Support/Allocator.h" #include 
"llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Error.h" diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/RecordName.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/RecordName.h index cc09db8933bd..8e06be9e41e8 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/RecordName.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/RecordName.h @@ -9,7 +9,6 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H #define LLVM_DEBUGINFO_CODEVIEW_RECORDNAME_H -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h index d832a48b1265..aaeffb2446ad 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolDumper.h @@ -11,8 +11,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h index 4383534b0db2..c37f6b4d5fa7 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -1003,9 +1003,6 @@ public: uint32_t RecordOffset = 0; }; -using CVSymbol = CVRecord; -using CVSymbolArray = VarStreamArray; - Expected readSymbolFromStream(BinaryStreamRef Stream, uint32_t Offset); diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h 
b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h index 57dbc56c0769..71bc70dde6ed 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h @@ -9,7 +9,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H #define LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" namespace llvm { namespace codeview { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h index 102d68c3fb2a..bde5a8b3ab2f 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeCollection.h @@ -10,9 +10,8 @@ #define LLVM_DEBUGINFO_CODEVIEW_TYPECOLLECTION_H #include "llvm/ADT/StringRef.h" - +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" namespace llvm { namespace codeview { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h index b0a16cccbff3..9f34d026b1ba 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h @@ -86,6 +86,16 @@ struct GloballyHashedType { bool empty() const { return *(const uint64_t*)Hash.data() == 0; } + friend inline bool operator==(const GloballyHashedType &L, + const GloballyHashedType &R) { + return L.Hash == R.Hash; + } + + friend inline bool operator!=(const GloballyHashedType &L, + const GloballyHashedType &R) { + return !(L.Hash == R.Hash); + } + /// Given a sequence of 
bytes representing a record, compute a global hash for /// this record. Due to the nature of global hashes incorporating the hashes /// of referenced records, this function requires a list of types and ids @@ -161,15 +171,10 @@ struct GloballyHashedType { return Hashes; } }; -#if defined(_MSC_VER) -// is_trivially_copyable is not available in older versions of libc++, but it is -// available in all supported versions of MSVC, so at least this gives us some -// coverage. static_assert(std::is_trivially_copyable::value, "GloballyHashedType must be trivially copyable so that we can " "reinterpret_cast arrays of hash data to arrays of " "GloballyHashedType"); -#endif } // namespace codeview template <> struct DenseMapInfo { @@ -206,7 +211,7 @@ template <> struct DenseMapInfo { static bool isEqual(codeview::GloballyHashedType LHS, codeview::GloballyHashedType RHS) { - return LHS.Hash == RHS.Hash; + return LHS == RHS; } }; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h index b9e2562bfc2b..bdc6cf46509b 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h @@ -116,13 +116,22 @@ public: uint32_t toArrayIndex() const { assert(!isSimple()); - return getIndex() - FirstNonSimpleIndex; + return (getIndex() & ~DecoratedItemIdMask) - FirstNonSimpleIndex; } static TypeIndex fromArrayIndex(uint32_t Index) { return TypeIndex(Index + FirstNonSimpleIndex); } + static TypeIndex fromDecoratedArrayIndex(bool IsItem, uint32_t Index) { + return TypeIndex((Index + FirstNonSimpleIndex) | + (IsItem ? 
DecoratedItemIdMask : 0)); + } + + TypeIndex removeDecoration() { + return TypeIndex(Index & ~DecoratedItemIdMask); + } + SimpleTypeKind getSimpleKind() const { assert(isSimple()); return static_cast(Index & SimpleKindMask); diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h index 469768787274..f4f5835d8b57 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h @@ -10,8 +10,8 @@ #define LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h index 35f5c0561138..3b6d1b0b1a70 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -14,7 +14,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/GUID.h" @@ -32,15 +31,10 @@ using support::little32_t; using support::ulittle16_t; using support::ulittle32_t; -using CVType = CVRecord; -using RemappedType = RemappedRecord; - struct CVMemberRecord { TypeLeafKind Kind; ArrayRef Data; }; -using CVTypeArray = VarStreamArray; -using CVTypeRange = iterator_range; /// Equvalent to CV_fldattr_t in cvinfo.h. 
struct MemberAttributes { @@ -703,7 +697,7 @@ public: : TypeRecord(TypeRecordKind::VFTable), CompleteClass(CompleteClass), OverriddenVFTable(OverriddenVFTable), VFPtrOffset(VFPtrOffset) { MethodNames.push_back(Name); - MethodNames.insert(MethodNames.end(), Methods.begin(), Methods.end()); + llvm::append_range(MethodNames, Methods); } TypeIndex getCompleteClass() const { return CompleteClass; } diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h index 19492b93681c..041f5214967c 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeRecordHelpers.h @@ -9,7 +9,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPERECORDHELPERS_H #define LLVM_DEBUGINFO_CODEVIEW_TYPERECORDHELPERS_H -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" namespace llvm { namespace codeview { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index d0506cce8176..04d7c7b0420a 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -11,7 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DIContext.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DIContext.h index 661d30d04c94..ae78fe912188 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DIContext.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DIContext.h @@ 
-35,6 +35,7 @@ struct DILineInfo { static constexpr const char *const Addr2LineBadString = "??"; std::string FileName; std::string FunctionName; + std::string StartFileName; Optional Source; uint32_t Line = 0; uint32_t Column = 0; @@ -43,12 +44,15 @@ struct DILineInfo { // DWARF-specific. uint32_t Discriminator = 0; - DILineInfo() : FileName(BadString), FunctionName(BadString) {} + DILineInfo() + : FileName(BadString), FunctionName(BadString), StartFileName(BadString) { + } bool operator==(const DILineInfo &RHS) const { return Line == RHS.Line && Column == RHS.Column && FileName == RHS.FileName && FunctionName == RHS.FunctionName && - StartLine == RHS.StartLine && Discriminator == RHS.Discriminator; + StartFileName == RHS.StartFileName && StartLine == RHS.StartLine && + Discriminator == RHS.Discriminator; } bool operator!=(const DILineInfo &RHS) const { @@ -56,10 +60,10 @@ struct DILineInfo { } bool operator<(const DILineInfo &RHS) const { - return std::tie(FileName, FunctionName, Line, Column, StartLine, - Discriminator) < - std::tie(RHS.FileName, RHS.FunctionName, RHS.Line, RHS.Column, - RHS.StartLine, RHS.Discriminator); + return std::tie(FileName, FunctionName, StartFileName, Line, Column, + StartLine, Discriminator) < + std::tie(RHS.FileName, RHS.FunctionName, RHS.StartFileName, RHS.Line, + RHS.Column, RHS.StartLine, RHS.Discriminator); } explicit operator bool() const { return *this != DILineInfo(); } @@ -72,6 +76,8 @@ struct DILineInfo { OS << "function '" << FunctionName << "', "; OS << "line " << Line << ", "; OS << "column " << Column << ", "; + if (StartFileName != BadString) + OS << "start file '" << StartFileName << "', "; OS << "start line " << StartLine << '\n'; } }; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h index 39ae53c4e7fe..cf4c827b9267 100644 --- 
a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h @@ -111,6 +111,16 @@ public: return AttributeSpecs[idx].Attr; } + bool getAttrIsImplicitConstByIndex(uint32_t idx) const { + assert(idx < AttributeSpecs.size()); + return AttributeSpecs[idx].isImplicitConst(); + } + + int64_t getAttrImplicitConstValueByIndex(uint32_t idx) const { + assert(idx < AttributeSpecs.size()); + return AttributeSpecs[idx].getImplicitConstValue(); + } + /// Get the index of the specified attribute. /// /// Searches the this abbreviation declaration for the index of the specified diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h index 97903a96b3fc..7d88e1447dca 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -146,6 +146,7 @@ public: bool verify(raw_ostream &OS, DIDumpOptions DumpOpts = {}) override; using unit_iterator_range = DWARFUnitVector::iterator_range; + using compile_unit_range = DWARFUnitVector::compile_unit_range; /// Get units from .debug_info in this context. unit_iterator_range info_section_units() { @@ -163,10 +164,12 @@ public: } /// Get compile units in this context. - unit_iterator_range compile_units() { return info_section_units(); } + compile_unit_range compile_units() { + return make_filter_range(info_section_units(), isCompileUnit); + } - /// Get type units in this context. - unit_iterator_range type_units() { return types_section_units(); } + // If you want type_units(), it'll need to be a concat iterator of a filter of + // TUs in info_section + all the (all type) units in types_section /// Get all normal compile/type units in this context. 
unit_iterator_range normal_units() { @@ -189,10 +192,13 @@ public: } /// Get compile units in the DWO context. - unit_iterator_range dwo_compile_units() { return dwo_info_section_units(); } + compile_unit_range dwo_compile_units() { + return make_filter_range(dwo_info_section_units(), isCompileUnit); + } - /// Get type units in the DWO context. - unit_iterator_range dwo_type_units() { return dwo_types_section_units(); } + // If you want dwo_type_units(), it'll need to be a concat iterator of a + // filter of TUs in dwo_info_section + all the (all type) units in + // dwo_types_section. /// Get all units in the DWO context. unit_iterator_range dwo_units() { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h index 32844ffd570f..69e67866946c 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h @@ -74,6 +74,24 @@ public: /// Return the full length of this table, including the length field. /// Return None if the length cannot be identified reliably. Optional getFullLength() const; + + /// Return the DWARF format of this table. + dwarf::DwarfFormat getFormat() const { return Format; } + + /// Return the length of this table. + uint64_t getLength() const { return Length; } + + /// Return the version of this table. + uint16_t getVersion() const { return Version; } + + /// Return the address size of this table. + uint8_t getAddressSize() const { return AddrSize; } + + /// Return the segment selector size of this table. + uint8_t getSegmentSelectorSize() const { return SegSize; } + + /// Return the parsed addresses of this table. 
+ ArrayRef getAddressEntries() const { return Addrs; } }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h index 0681a2e33a50..3d5852ee1518 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h @@ -60,7 +60,8 @@ public: DWARFDebugArangeSet() { clear(); } void clear(); - Error extract(DWARFDataExtractor data, uint64_t *offset_ptr); + Error extract(DWARFDataExtractor data, uint64_t *offset_ptr, + function_ref WarningHandler); void dump(raw_ostream &OS) const; uint64_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; } diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h index 233b55cc55c1..af87811f5d7d 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -71,8 +71,8 @@ public: /// where a problem occurred in case an error is returned. Error parse(DWARFDataExtractor Data, uint64_t *Offset, uint64_t EndOffset); - void dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH, - unsigned IndentLevel = 1) const; + void dump(raw_ostream &OS, DIDumpOptions DumpOpts, const MCRegisterInfo *MRI, + bool IsEH, unsigned IndentLevel = 1) const; private: std::vector Instructions; @@ -121,7 +121,8 @@ private: static ArrayRef getOperandTypes(); /// Print \p Opcode's operand number \p OperandIdx which has value \p Operand. 
- void printOperand(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH, + void printOperand(raw_ostream &OS, DIDumpOptions DumpOpts, + const MCRegisterInfo *MRI, bool IsEH, const Instruction &Instr, unsigned OperandIdx, uint64_t Operand) const; }; @@ -146,8 +147,8 @@ public: CFIProgram &cfis() { return CFIs; } /// Dump the instructions in this CFI fragment - virtual void dump(raw_ostream &OS, const MCRegisterInfo *MRI, - bool IsEH) const = 0; + virtual void dump(raw_ostream &OS, DIDumpOptions DumpOpts, + const MCRegisterInfo *MRI, bool IsEH) const = 0; protected: const FrameKind Kind; @@ -201,7 +202,7 @@ public: uint32_t getLSDAPointerEncoding() const { return LSDAPointerEncoding; } - void dump(raw_ostream &OS, const MCRegisterInfo *MRI, + void dump(raw_ostream &OS, DIDumpOptions DumpOpts, const MCRegisterInfo *MRI, bool IsEH) const override; private: @@ -242,7 +243,7 @@ public: uint64_t getAddressRange() const { return AddressRange; } Optional getLSDAAddress() const { return LSDAAddress; } - void dump(raw_ostream &OS, const MCRegisterInfo *MRI, + void dump(raw_ostream &OS, DIDumpOptions DumpOpts, const MCRegisterInfo *MRI, bool IsEH) const override; static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_FDE; } @@ -285,7 +286,7 @@ public: ~DWARFDebugFrame(); /// Dump the section data into the given stream. - void dump(raw_ostream &OS, const MCRegisterInfo *MRI, + void dump(raw_ostream &OS, DIDumpOptions DumpOpts, const MCRegisterInfo *MRI, Optional Offset) const; /// Parse the section from raw data. 
\p Data is assumed to contain the whole diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h index fe46d613aedd..bc6c67ae6c5d 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h @@ -121,6 +121,8 @@ public: bool hasFileAtIndex(uint64_t FileIndex) const; + Optional getLastValidFileIndex() const; + bool getFileNameByIndex(uint64_t FileIndex, StringRef CompDir, DILineInfoSpecifier::FileLineInfoKind Kind, @@ -251,6 +253,10 @@ public: return Prologue.hasFileAtIndex(FileIndex); } + Optional getLastValidFileIndex() const { + return Prologue.getLastValidFileIndex(); + } + /// Extracts filename by its index in filename table in prologue. /// In Dwarf 4, the files are 1-indexed and the current compilation file /// name is not represented in the list. In DWARF v5, the files are @@ -309,12 +315,10 @@ public: /// Helper to allow for parsing of an entire .debug_line section in sequence. class SectionParser { public: - using cu_range = DWARFUnitVector::iterator_range; - using tu_range = DWARFUnitVector::iterator_range; using LineToUnitMap = std::map; - SectionParser(DWARFDataExtractor &Data, const DWARFContext &C, cu_range CUs, - tu_range TUs); + SectionParser(DWARFDataExtractor &Data, const DWARFContext &C, + DWARFUnitVector::iterator_range Units); /// Get the next line table from the section. Report any issues via the /// handlers. 
diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h index 3b141304f85f..dbc11c51a789 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h @@ -72,6 +72,8 @@ public: std::function(uint32_t)> LookupAddr, function_ref)> Callback) const; + const DWARFDataExtractor &getData() { return Data; } + protected: DWARFDataExtractor Data; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h index 4d463d8fe6f5..f1768a1ddab5 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h @@ -96,6 +96,9 @@ class DWARFDebugMacro { MacroHeader Header; SmallVector Macros; uint64_t Offset; + + /// Whether or not this is a .debug_macro section. + bool IsDebugMacro; }; /// A list of all the macro entries in the debug_macinfo section. @@ -107,7 +110,7 @@ public: /// Print the macro list found within the debug_macinfo/debug_macro section. void dump(raw_ostream &OS) const; - Error parseMacro(DWARFUnitVector::iterator_range Units, + Error parseMacro(DWARFUnitVector::compile_unit_range Units, DataExtractor StringExtractor, DWARFDataExtractor MacroData) { return parseImpl(Units, StringExtractor, MacroData, /*IsMacro=*/true); @@ -123,7 +126,7 @@ public: private: /// Parse the debug_macinfo/debug_macro section accessible via the 'MacroData' /// parameter. 
- Error parseImpl(Optional Units, + Error parseImpl(Optional Units, Optional StringExtractor, DWARFDataExtractor Data, bool IsMacro); }; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h index 88e5432851d6..4d28bdcde2e4 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h @@ -34,7 +34,7 @@ struct RangeListEntry : public DWARFListEntryBase { uint64_t Value0; uint64_t Value1; - Error extract(DWARFDataExtractor Data, uint64_t End, uint64_t *OffsetPtr); + Error extract(DWARFDataExtractor Data, uint64_t *OffsetPtr); void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength, uint64_t &CurrentBase, DIDumpOptions DumpOpts, llvm::function_ref(uint32_t)> @@ -48,6 +48,7 @@ public: /// Build a DWARFAddressRangesVector from a rangelist. DWARFAddressRangesVector getAbsoluteRanges(Optional BaseAddr, + uint8_t AddressByteSize, function_ref(uint32_t)> LookupPooledAddress) const; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index 05a6056e8e21..0f76d7f1b31c 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -262,6 +262,7 @@ public: /// for this subprogram by resolving DW_AT_sepcification or /// DW_AT_abstract_origin references if necessary. uint64_t getDeclLine() const; + std::string getDeclFile(DILineInfoSpecifier::FileLineInfoKind Kind) const; /// Retrieves values of DW_AT_call_file, DW_AT_call_line and DW_AT_call_column /// from DIE (or zeroes if they are missing). 
This function looks for @@ -381,11 +382,6 @@ inline bool operator==(const DWARFDie::iterator &LHS, return LHS.Die == RHS.Die; } -inline bool operator!=(const DWARFDie::iterator &LHS, - const DWARFDie::iterator &RHS) { - return !(LHS == RHS); -} - // These inline functions must follow the DWARFDie::iterator definition above // as they use functions from that class. inline DWARFDie::iterator DWARFDie::begin() const { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h index edfa68d49a60..447ad66b9352 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h @@ -10,10 +10,11 @@ #define LLVM_DEBUGINFO_DWARFEXPRESSION_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/ADT/Optional.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/DataExtractor.h" namespace llvm { @@ -93,8 +94,9 @@ public: bool extract(DataExtractor Data, uint8_t AddressSize, uint64_t Offset, Optional Format); bool isError() { return Error; } - bool print(raw_ostream &OS, const DWARFExpression *Expr, - const MCRegisterInfo *RegInfo, DWARFUnit *U, bool isEH); + bool print(raw_ostream &OS, DIDumpOptions DumpOpts, + const DWARFExpression *Expr, const MCRegisterInfo *RegInfo, + DWARFUnit *U, bool isEH); bool verify(DWARFUnit *U); }; @@ -143,7 +145,8 @@ public: iterator begin() const { return iterator(this, 0); } iterator end() const { return iterator(this, Data.getData().size()); } - void print(raw_ostream &OS, const MCRegisterInfo *RegInfo, DWARFUnit *U, + void print(raw_ostream &OS, DIDumpOptions DumpOpts, + const MCRegisterInfo *RegInfo, DWARFUnit *U, bool IsEH = false) const; /// Print the expression in a format intended to be compact and useful 
to a @@ -164,10 +167,5 @@ inline bool operator==(const DWARFExpression::iterator &LHS, const DWARFExpression::iterator &RHS) { return LHS.Expr == RHS.Expr && LHS.Offset == RHS.Offset; } - -inline bool operator!=(const DWARFExpression::iterator &LHS, - const DWARFExpression::iterator &RHS) { - return !(LHS == RHS); -} } #endif diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 3f1be4e5a592..1342e645934c 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -82,6 +82,9 @@ public: void dump(raw_ostream &OS, DIDumpOptions DumpOpts = DIDumpOptions()) const; void dumpSectionedAddress(raw_ostream &OS, DIDumpOptions DumpOpts, object::SectionedAddress SA) const; + void dumpAddress(raw_ostream &OS, uint64_t Address) const; + static void dumpAddress(raw_ostream &OS, uint8_t AddressSize, + uint64_t Address); static void dumpAddressSection(const DWARFObject &Obj, raw_ostream &OS, DIDumpOptions DumpOpts, uint64_t SectionIndex); diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h index 496fdb2477f9..8f58b4e6458e 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h @@ -46,7 +46,7 @@ public: const ListEntries &getEntries() const { return Entries; } bool empty() const { return Entries.empty(); } void clear() { Entries.clear(); } - Error extract(DWARFDataExtractor Data, uint64_t HeaderOffset, uint64_t End, + Error extract(DWARFDataExtractor Data, uint64_t HeaderOffset, uint64_t *OffsetPtr, StringRef SectionName, StringRef ListStringName); }; @@ -72,10 +72,6 @@ class DWARFListTableHeader { }; Header HeaderData; - /// The offset table, which contains 
offsets to the individual list entries. - /// It is used by forms such as DW_FORM_rnglistx. - /// FIXME: Generate the table and use the appropriate forms. - std::vector Offsets; /// The table's format, either DWARF32 or DWARF64. dwarf::DwarfFormat Format; /// The offset at which the header (and hence the table) is located within @@ -93,7 +89,6 @@ public: void clear() { HeaderData = {}; - Offsets.clear(); } uint64_t getHeaderOffset() const { return HeaderOffset; } uint8_t getAddrSize() const { return HeaderData.AddrSize; } @@ -115,11 +110,23 @@ public: llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64"); } - void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const; - Optional getOffsetEntry(uint32_t Index) const { - if (Index < Offsets.size()) - return Offsets[Index]; - return None; + void dump(DataExtractor Data, raw_ostream &OS, + DIDumpOptions DumpOpts = {}) const; + Optional getOffsetEntry(DataExtractor Data, uint32_t Index) const { + if (Index > HeaderData.OffsetEntryCount) + return None; + + return getOffsetEntry(Data, getHeaderOffset() + getHeaderSize(Format), Format, Index); + } + + static Optional getOffsetEntry(DataExtractor Data, + uint64_t OffsetTableOffset, + dwarf::DwarfFormat Format, + uint32_t Index) { + uint8_t OffsetByteSize = Format == dwarf::DWARF64 ? 8 : 4; + uint64_t Offset = OffsetTableOffset + OffsetByteSize * Index; + auto R = Data.getUnsigned(&Offset, OffsetByteSize); + return R; } /// Extract the table header and the array of offsets. @@ -169,14 +176,14 @@ public: uint8_t getAddrSize() const { return Header.getAddrSize(); } dwarf::DwarfFormat getFormat() const { return Header.getFormat(); } - void dump(raw_ostream &OS, + void dump(DWARFDataExtractor Data, raw_ostream &OS, llvm::function_ref(uint32_t)> LookupPooledAddress, DIDumpOptions DumpOpts = {}) const; /// Return the contents of the offset entry designated by a given index. 
- Optional getOffsetEntry(uint32_t Index) const { - return Header.getOffsetEntry(Index); + Optional getOffsetEntry(DataExtractor Data, uint32_t Index) const { + return Header.getOffsetEntry(Data, Index); } /// Return the size of the table header including the length but not including /// the offsets. This is dependent on the table format, which is unambiguously @@ -196,18 +203,18 @@ Error DWARFListTableBase::extract(DWARFDataExtractor Data, return E; Data.setAddressSize(Header.getAddrSize()); - uint64_t End = getHeaderOffset() + Header.length(); - while (*OffsetPtr < End) { + Data = DWARFDataExtractor(Data, getHeaderOffset() + Header.length()); + while (Data.isValidOffset(*OffsetPtr)) { DWARFListType CurrentList; uint64_t Off = *OffsetPtr; - if (Error E = CurrentList.extract(Data, getHeaderOffset(), End, OffsetPtr, + if (Error E = CurrentList.extract(Data, getHeaderOffset(), OffsetPtr, Header.getSectionName(), Header.getListTypeString())) return E; ListMap[Off] = CurrentList; } - assert(*OffsetPtr == End && + assert(*OffsetPtr == Data.size() && "mismatch between expected length of table and length " "of extracted data"); return Error::success(); @@ -215,18 +222,18 @@ Error DWARFListTableBase::extract(DWARFDataExtractor Data, template Error DWARFListType::extract(DWARFDataExtractor Data, - uint64_t HeaderOffset, uint64_t End, + uint64_t HeaderOffset, uint64_t *OffsetPtr, StringRef SectionName, StringRef ListTypeString) { - if (*OffsetPtr < HeaderOffset || *OffsetPtr >= End) + if (*OffsetPtr < HeaderOffset || *OffsetPtr >= Data.size()) return createStringError(errc::invalid_argument, "invalid %s list offset 0x%" PRIx64, ListTypeString.data(), *OffsetPtr); Entries.clear(); - while (*OffsetPtr < End) { + while (Data.isValidOffset(*OffsetPtr)) { ListEntryType Entry; - if (Error E = Entry.extract(Data, End, OffsetPtr)) + if (Error E = Entry.extract(Data, OffsetPtr)) return E; Entries.push_back(Entry); if (Entry.isSentinel()) @@ -240,11 +247,11 @@ Error 
DWARFListType::extract(DWARFDataExtractor Data, template void DWARFListTableBase::dump( - raw_ostream &OS, + DWARFDataExtractor Data, raw_ostream &OS, llvm::function_ref(uint32_t)> LookupPooledAddress, DIDumpOptions DumpOpts) const { - Header.dump(OS, DumpOpts); + Header.dump(Data, OS, DumpOpts); OS << HeaderString << "\n"; // Determine the length of the longest encoding string we have in the table, @@ -269,19 +276,14 @@ template Expected DWARFListTableBase::findList(DWARFDataExtractor Data, uint64_t Offset) { - auto Entry = ListMap.find(Offset); - if (Entry != ListMap.end()) - return Entry->second; - // Extract the list from the section and enter it into the list map. DWARFListType List; - uint64_t End = getHeaderOffset() + Header.length(); - uint64_t StartingOffset = Offset; + if (Header.length()) + Data = DWARFDataExtractor(Data, getHeaderOffset() + Header.length()); if (Error E = - List.extract(Data, getHeaderOffset(), End, &Offset, + List.extract(Data, Header.length() ? getHeaderOffset() : 0, &Offset, Header.getSectionName(), Header.getListTypeString())) return std::move(E); - ListMap[StartingOffset] = List; return List; } diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 5b3b46626059..369cbdc28c2e 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -113,6 +113,8 @@ public: const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, DWARFSectionKind Kind); +bool isCompileUnit(const std::unique_ptr &U); + /// Describe a collection of units. Intended to hold all units either from /// .debug_info and .debug_types, or from .debug_info.dwo and .debug_types.dwo. 
class DWARFUnitVector final : public SmallVector, 1> { @@ -127,6 +129,9 @@ public: using iterator = typename UnitVector::iterator; using iterator_range = llvm::iterator_range; + using compile_unit_range = + decltype(make_filter_range(std::declval(), isCompileUnit)); + DWARFUnit *getUnitForOffset(uint64_t Offset) const; DWARFUnit *getUnitForIndexEntry(const DWARFUnitIndex::Entry &E); @@ -204,7 +209,6 @@ class DWARFUnit { const DWARFDebugAbbrev *Abbrev; const DWARFSection *RangeSection; uint64_t RangeSectionBase; - const DWARFSection *LocSection; uint64_t LocSectionBase; /// Location table of this unit. @@ -223,10 +227,6 @@ class DWARFUnit { /// offsets table (DWARF v5). Optional StringOffsetsTableContribution; - /// A table of range lists (DWARF v5 and later). - Optional RngListTable; - Optional LoclistTableHeader; - mutable const DWARFAbbreviationDeclarationSet *Abbrevs; llvm::Optional BaseAddr; /// The compile unit debug information entry items. @@ -294,6 +294,7 @@ public: dwarf::DwarfFormat getFormat() const { return Header.getFormat(); } uint8_t getUnitType() const { return Header.getUnitType(); } bool isTypeUnit() const { return Header.isTypeUnit(); } + uint64_t getAbbrOffset() const { return Header.getAbbrOffset(); } uint64_t getNextUnitOffset() const { return Header.getNextUnitOffset(); } const DWARFSection &getLineSection() const { return LineSection; } StringRef getStringSection() const { return StringSection; } @@ -313,10 +314,6 @@ public: RangeSection = RS; RangeSectionBase = Base; } - void setLocSection(const DWARFSection *LS, uint64_t Base) { - LocSection = LS; - LocSectionBase = Base; - } uint64_t getLocSectionBase() const { return LocSectionBase; @@ -411,21 +408,10 @@ public: /// Return a rangelist's offset based on an index. The index designates /// an entry in the rangelist table's offset array and is supplied by /// DW_FORM_rnglistx. 
- Optional getRnglistOffset(uint32_t Index) { - if (!RngListTable) - return None; - if (Optional Off = RngListTable->getOffsetEntry(Index)) - return *Off + RangeSectionBase; - return None; - } + Optional getRnglistOffset(uint32_t Index); + + Optional getLoclistOffset(uint32_t Index); - Optional getLoclistOffset(uint32_t Index) { - if (!LoclistTableHeader) - return None; - if (Optional Off = LoclistTableHeader->getOffsetEntry(Index)) - return *Off + getLocSectionBase(); - return None; - } Expected collectAddressRanges(); Expected @@ -480,7 +466,6 @@ public: /// The unit needs to have its DIEs extracted for this method to work. DWARFDie getDIEForOffset(uint64_t Offset) { extractDIEsIfNeeded(false); - assert(!DieArray.empty()); auto It = llvm::partition_point(DieArray, [=](const DWARFDebugInfoEntry &DIE) { return DIE.getOffset() < Offset; @@ -529,6 +514,10 @@ private: bool parseDWO(); }; +inline bool isCompileUnit(const std::unique_ptr &U) { + return !U->isTypeUnit(); +} + } // end namespace llvm #endif // LLVM_DEBUGINFO_DWARF_DWARFUNIT_H diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 22b1d722fc89..18d889f5cadb 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -12,25 +12,22 @@ #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" -#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" - #include #include #include namespace llvm { class raw_ostream; +struct DWARFAddressRange; struct DWARFAttribute; class DWARFContext; -class DWARFDie; -class DWARFUnit; -class DWARFCompileUnit; class DWARFDataExtractor; class DWARFDebugAbbrev; class DataExtractor; struct DWARFSection; +class DWARFUnit; 
/// A class that verifies DWARF debug information given a DWARF Context. class DWARFVerifier { diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h index 593d781b990e..473c89e8106f 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h @@ -24,8 +24,6 @@ namespace llvm { namespace msf { -struct MSFLayout; - /// MappedBlockStream represents data stored in an MSF file into chunks of a /// particular size (called the Block Size), and whose chunks may not be /// necessarily contiguous. The arrangement of these chunks MSF the file diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h index beaaef0c5a6c..82b63d729454 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h @@ -34,6 +34,34 @@ struct MSFLayout; } namespace pdb { +// Represents merged or unmerged symbols. Merged symbols can be written to the +// output file as is, but unmerged symbols must be rewritten first. In either +// case, the size must be known up front. 
+struct SymbolListWrapper { + explicit SymbolListWrapper(ArrayRef Syms) + : SymPtr(const_cast(Syms.data())), SymSize(Syms.size()), + NeedsToBeMerged(false) {} + explicit SymbolListWrapper(void *SymSrc, uint32_t Length) + : SymPtr(SymSrc), SymSize(Length), NeedsToBeMerged(true) {} + + ArrayRef asArray() const { + return ArrayRef(static_cast(SymPtr), SymSize); + } + + uint32_t size() const { return SymSize; } + + void *SymPtr = nullptr; + uint32_t SymSize = 0; + bool NeedsToBeMerged = false; +}; + +/// Represents a string table reference at some offset in the module symbol +/// stream. +struct StringTableFixup { + uint32_t StrTabOffset = 0; + uint32_t SymOffsetOfReference = 0; +}; + class DbiModuleDescriptorBuilder { friend class DbiStreamBuilder; @@ -48,10 +76,28 @@ public: void setPdbFilePathNI(uint32_t NI); void setObjFileName(StringRef Name); + + // Callback to merge one source of unmerged symbols. + using MergeSymbolsCallback = Error (*)(void *Ctx, void *Symbols, + BinaryStreamWriter &Writer); + + void setMergeSymbolsCallback(void *Ctx, MergeSymbolsCallback Callback) { + MergeSymsCtx = Ctx; + MergeSymsCallback = Callback; + } + + void setStringTableFixups(std::vector &&Fixups) { + StringTableFixups = std::move(Fixups); + } + void setFirstSectionContrib(const SectionContrib &SC); void addSymbol(codeview::CVSymbol Symbol); void addSymbolsInBulk(ArrayRef BulkSymbols); + // Add symbols of known size which will be merged (rewritten) when committing + // the PDB to disk. + void addUnmergedSymbols(void *SymSrc, uint32_t SymLength); + void addDebugSubsection(std::shared_ptr Subsection); @@ -77,8 +123,14 @@ public: void finalize(); Error finalizeMsfLayout(); - Error commit(BinaryStreamWriter &ModiWriter, const msf::MSFLayout &MsfLayout, - WritableBinaryStreamRef MsfBuffer); + /// Commit the DBI descriptor to the DBI stream. + Error commit(BinaryStreamWriter &ModiWriter); + + /// Commit the accumulated symbols to the module symbol stream. 
Safe to call + /// in parallel on different DbiModuleDescriptorBuilder objects. Only modifies + /// the pre-allocated stream in question. + Error commitSymbolStream(const msf::MSFLayout &MsfLayout, + WritableBinaryStreamRef MsfBuffer); private: uint32_t calculateC13DebugInfoSize() const; @@ -91,7 +143,12 @@ private: std::string ModuleName; std::string ObjFileName; std::vector SourceFiles; - std::vector> Symbols; + std::vector Symbols; + + void *MergeSymsCtx = nullptr; + MergeSymbolsCallback MergeSymsCallback = nullptr; + + std::vector StringTableFixups; std::vector C13Builders; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h new file mode 100644 index 000000000000..480b3fb11419 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h @@ -0,0 +1,41 @@ +//==- NativeEnumSymbols.h - Native Symbols Enumerator impl -------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H +#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H + +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" +#include "llvm/DebugInfo/PDB/PDBSymbol.h" + +#include + +namespace llvm { +namespace pdb { + +class NativeSession; + +class NativeEnumSymbols : public IPDBEnumChildren { +public: + NativeEnumSymbols(NativeSession &Session, std::vector Symbols); + + uint32_t getChildCount() const override; + std::unique_ptr getChildAtIndex(uint32_t Index) const override; + std::unique_ptr getNext() override; + void reset() override; + +private: + std::vector Symbols; + uint32_t Index; + NativeSession &Session; +}; + +} // namespace pdb +} // namespace llvm + +#endif diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h index 4adf89f0d69a..b219055d2153 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h @@ -20,7 +20,7 @@ namespace pdb { class NativeFunctionSymbol : public NativeRawSymbol { public: NativeFunctionSymbol(NativeSession &Session, SymIndexId Id, - const codeview::ProcSym &Sym); + const codeview::ProcSym &Sym, uint32_t RecordOffset); ~NativeFunctionSymbol() override; @@ -30,13 +30,15 @@ public: uint32_t getAddressOffset() const override; uint32_t getAddressSection() const override; std::string getName() const override; - PDB_SymType getSymTag() const override; uint64_t getLength() const override; uint32_t getRelativeVirtualAddress() const override; uint64_t getVirtualAddress() const override; + std::unique_ptr + findInlineFramesByVA(uint64_t VA) const override; 
protected: const codeview::ProcSym Sym; + uint32_t RecordOffset = 0; }; } // namespace pdb diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h new file mode 100644 index 000000000000..2f6aba038ae8 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h @@ -0,0 +1,46 @@ +//===- NativeInlineSiteSymbol.h - info about inline sites -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEINLINESITESYMBOL_H +#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEINLINESITESYMBOL_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" +#include "llvm/DebugInfo/PDB/Native/NativeSession.h" + +namespace llvm { +namespace pdb { + +class NativeInlineSiteSymbol : public NativeRawSymbol { +public: + NativeInlineSiteSymbol(NativeSession &Session, SymIndexId Id, + const codeview::InlineSiteSym &Sym, + uint64_t ParentAddr); + + ~NativeInlineSiteSymbol() override; + + void dump(raw_ostream &OS, int Indent, PdbSymbolIdField ShowIdFields, + PdbSymbolIdField RecurseIdFields) const override; + + std::string getName() const override; + std::unique_ptr + findInlineeLinesByVA(uint64_t VA, uint32_t Length) const override; + +private: + const codeview::InlineSiteSym Sym; + uint64_t ParentAddr; + + void getLineOffset(uint32_t OffsetInFunc, uint32_t &LineOffset, + uint32_t &FileOffset) const; +}; + +} // namespace pdb +} // namespace llvm + +#endif // LLVM_DEBUGINFO_PDB_NATIVE_NATIVEINLINESITESYMBOL_H diff --git 
a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h index a7ce82c70b08..5dedc70f11ba 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h @@ -22,7 +22,7 @@ public: const codeview::LineInfo Line, uint32_t ColumnNumber, uint32_t Length, uint32_t Section, uint32_t Offset, - uint32_t SrcFileId); + uint32_t SrcFileId, uint32_t CompilandId); uint32_t getLineNumber() const override; uint32_t getLineNumberEnd() const override; @@ -45,6 +45,7 @@ private: uint32_t Offset; uint32_t Length; uint32_t SrcFileId; + uint32_t CompilandId; }; } // namespace pdb } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h index 0a1451530f18..9f410e27f4cb 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h @@ -30,7 +30,6 @@ public: uint32_t getAddressOffset() const override; uint32_t getAddressSection() const override; std::string getName() const override; - PDB_SymType getSymTag() const override; uint32_t getRelativeVirtualAddress() const override; uint64_t getVirtualAddress() const override; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h index 342e63599e66..5f8fc587e546 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h @@ -110,9 +110,14 @@ public: const SymbolCache &getSymbolCache() const { return Cache; } uint32_t getRVAFromSectOffset(uint32_t Section, 
uint32_t Offset) const; uint64_t getVAFromSectOffset(uint32_t Section, uint32_t Offset) const; + bool moduleIndexForVA(uint64_t VA, uint16_t &ModuleIndex) const; + bool moduleIndexForSectOffset(uint32_t Sect, uint32_t Offset, + uint16_t &ModuleIndex) const; + Expected getModuleDebugStream(uint32_t Index) const; private: void initializeExeSymbol(); + void parseSectionContribs(); std::unique_ptr Pdb; std::unique_ptr Allocator; @@ -120,6 +125,12 @@ private: SymbolCache Cache; SymIndexId ExeSymbol = 0; uint64_t LoadAddress = 0; + + /// Map from virtual address to module index. + using IMap = + IntervalMap>; + IMap::Allocator IMapAllocator; + IMap AddrToModuleIndex; }; } // namespace pdb } // namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h index 90fd19a7a2fb..1ff6ca173b2b 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h @@ -37,40 +37,40 @@ class SymbolCache { /// an Id. Id allocation is an implementation, with the only guarantee /// being that once an Id is allocated, the symbol can be assumed to be /// cached. - std::vector> Cache; + mutable std::vector> Cache; /// For type records from the TPI stream which have been paresd and cached, /// stores a mapping to SymIndexId of the cached symbol. - DenseMap TypeIndexToSymbolId; + mutable DenseMap TypeIndexToSymbolId; /// For field list members which have been parsed and cached, stores a mapping /// from (IndexOfClass, MemberIndex) to the corresponding SymIndexId of the /// cached symbol. - DenseMap, SymIndexId> + mutable DenseMap, SymIndexId> FieldListMembersToSymbolId; /// List of SymIndexIds for each compiland, indexed by compiland index as they /// appear in the PDB file. 
- std::vector Compilands; + mutable std::vector Compilands; /// List of source files, indexed by unique source file index. mutable std::vector> SourceFiles; + + /// Map from string table offset to source file Id. mutable DenseMap FileNameOffsetToId; /// Map from global symbol offset to SymIndexId. - DenseMap GlobalOffsetToSymbolId; - - /// Map from segment and code offset to SymIndexId. - DenseMap, SymIndexId> AddressToFunctionSymId; - DenseMap, SymIndexId> AddressToPublicSymId; + mutable DenseMap GlobalOffsetToSymbolId; - /// Map from virtual address to module index. - using IMap = - IntervalMap>; - IMap::Allocator IMapAllocator; - IMap AddrToModuleIndex; + /// Map from segment and code offset to function symbols. + mutable DenseMap, SymIndexId> AddressToSymbolId; + /// Map from segment and code offset to public symbols. + mutable DenseMap, SymIndexId> + AddressToPublicSymId; - Expected getModuleDebugStream(uint32_t Index) const; + /// Map from module index and symbol table offset to SymIndexId. + mutable DenseMap, SymIndexId> + SymTabOffsetToSymbolId; struct LineTableEntry { uint64_t Addr; @@ -83,7 +83,7 @@ class SymbolCache { std::vector findLineTable(uint16_t Modi) const; mutable DenseMap> LineTable; - SymIndexId createSymbolPlaceholder() { + SymIndexId createSymbolPlaceholder() const { SymIndexId Id = Cache.size(); Cache.push_back(nullptr); return Id; @@ -91,7 +91,7 @@ class SymbolCache { template SymIndexId createSymbolForType(codeview::TypeIndex TI, codeview::CVType CVT, - Args &&... 
ConstructorArgs) { + Args &&...ConstructorArgs) const { CVRecordT Record; if (auto EC = codeview::TypeDeserializer::deserializeAs(CVT, Record)) { @@ -104,10 +104,10 @@ class SymbolCache { } SymIndexId createSymbolForModifiedType(codeview::TypeIndex ModifierTI, - codeview::CVType CVT); + codeview::CVType CVT) const; SymIndexId createSimpleType(codeview::TypeIndex TI, - codeview::ModifierOptions Mods); + codeview::ModifierOptions Mods) const; std::unique_ptr findFunctionSymbolBySectOffset(uint32_t Sect, uint32_t Offset); @@ -118,7 +118,7 @@ public: SymbolCache(NativeSession &Session, DbiStream *Dbi); template - SymIndexId createSymbol(Args &&... ConstructorArgs) { + SymIndexId createSymbol(Args &&...ConstructorArgs) const { SymIndexId Id = Cache.size(); // Initial construction must not access the cache, since it must be done @@ -145,7 +145,7 @@ public: std::unique_ptr createGlobalsEnumerator(codeview::SymbolKind Kind); - SymIndexId findSymbolByTypeIndex(codeview::TypeIndex TI); + SymIndexId findSymbolByTypeIndex(codeview::TypeIndex TI) const; template SymIndexId getOrCreateFieldListMember(codeview::TypeIndex FieldListTI, @@ -163,6 +163,9 @@ public: } SymIndexId getOrCreateGlobalSymbolByOffset(uint32_t Offset); + SymIndexId getOrCreateInlineSymbol(codeview::InlineSiteSym Sym, + uint64_t ParentAddr, uint16_t Modi, + uint32_t RecordOffset) const; std::unique_ptr findSymbolBySectOffset(uint32_t Sect, uint32_t Offset, PDB_SymType Type); @@ -185,9 +188,6 @@ public: std::unique_ptr getSourceFileById(SymIndexId FileId) const; SymIndexId getOrCreateSourceFile(const codeview::FileChecksumEntry &Checksum) const; - - void parseSectionContribs(); - Optional getModuleIndexForAddr(uint64_t Addr) const; }; } // namespace pdb diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h index 1b7fd2d54cb2..70288868ca21 100644 --- 
a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -9,7 +9,7 @@ #ifndef LLVM_DEBUGINFO_PDB_RAW_PDBTPISTREAM_H #define LLVM_DEBUGINFO_PDB_RAW_PDBTPISTREAM_H -#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h index 72d98e9c2c4d..9ef2ee6a9307 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h @@ -54,16 +54,20 @@ public: void setVersionHeader(PdbRaw_TpiVer Version); void addTypeRecord(ArrayRef Type, Optional Hash); + void addTypeRecords(ArrayRef Types, ArrayRef Sizes, + ArrayRef Hashes); Error finalizeMsfLayout(); - uint32_t getRecordCount() const { return TypeRecords.size(); } + uint32_t getRecordCount() const { return TypeRecordCount; } Error commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef Buffer); uint32_t calculateSerializedLength(); private: + void updateTypeIndexOffsets(ArrayRef Sizes); + uint32_t calculateHashBufferSize() const; uint32_t calculateIndexOffsetSize() const; Error finalize(); @@ -71,10 +75,11 @@ private: msf::MSFBuilder &Msf; BumpPtrAllocator &Allocator; + uint32_t TypeRecordCount = 0; size_t TypeRecordBytes = 0; PdbRaw_TpiVer VerHeader = PdbRaw_TpiVer::PdbTpiV80; - std::vector> TypeRecords; + std::vector> TypeRecBuffers; std::vector TypeHashes; std::vector TypeIndexOffsets; uint32_t HashStreamIndex = kInvalidStreamIndex; diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h 
b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h index 45aba013e7c8..802d18a069ee 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBExtras.h @@ -9,16 +9,15 @@ #ifndef LLVM_DEBUGINFO_PDB_PDBEXTRAS_H #define LLVM_DEBUGINFO_PDB_PDBEXTRAS_H +#include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/PDB/PDBTypes.h" #include "llvm/Support/raw_ostream.h" - +#include #include namespace llvm { -class raw_ostream; - namespace pdb { using TagStats = std::unordered_map; @@ -51,7 +50,6 @@ void dumpSymbolField(raw_ostream &OS, StringRef Name, T Value, int Indent) { OS << Name << ": " << Value; } - } // end namespace pdb } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h index 2982146f960c..24cf1e459f92 100644 --- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h @@ -42,7 +42,6 @@ class StringRef; class raw_ostream; namespace pdb { -class IPDBRawSymbol; class IPDBSession; #define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue) \ @@ -141,7 +140,14 @@ public: StringRef Name, PDB_NameSearchFlags Flags, uint32_t RVA) const; + std::unique_ptr findInlineFramesByVA(uint64_t VA) const; std::unique_ptr findInlineFramesByRVA(uint32_t RVA) const; + std::unique_ptr + findInlineeLinesByVA(uint64_t VA, uint32_t Length) const; + std::unique_ptr + findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const; + + std::string getName() const; const IPDBRawSymbol &getRawSymbol() const { return *RawSymbol; } IPDBRawSymbol &getRawSymbol() { return *RawSymbol; } diff --git a/contrib/llvm-project/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/contrib/llvm-project/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h index 085e4bb4ccb8..1c8fa11660af 100644 
--- a/contrib/llvm-project/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h +++ b/contrib/llvm-project/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h @@ -43,7 +43,7 @@ public: bool Demangle = true; bool RelativeAddresses = false; bool UntagAddresses = false; - bool UseNativePDBReader = false; + bool UseDIA = false; std::string DefaultArch; std::vector DsymHints; std::string FallbackDebugPath; diff --git a/contrib/llvm-project/llvm/include/llvm/Demangle/ItaniumDemangle.h b/contrib/llvm-project/llvm/include/llvm/Demangle/ItaniumDemangle.h index 6ab873218386..e5fca98f9271 100644 --- a/contrib/llvm-project/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/contrib/llvm-project/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -82,6 +82,7 @@ X(PostfixExpr) \ X(ConditionalExpr) \ X(MemberExpr) \ + X(SubobjectExpr) \ X(EnclosingExpr) \ X(CastExpr) \ X(SizeofParamPackExpr) \ @@ -91,10 +92,10 @@ X(PrefixExpr) \ X(FunctionParam) \ X(ConversionExpr) \ + X(PointerToMemberConversionExpr) \ X(InitListExpr) \ X(FoldExpr) \ X(ThrowExpr) \ - X(UUIDOfExpr) \ X(BoolExpr) \ X(StringLiteral) \ X(LambdaExpr) \ @@ -1656,6 +1657,40 @@ public: } }; +class SubobjectExpr : public Node { + const Node *Type; + const Node *SubExpr; + StringView Offset; + NodeArray UnionSelectors; + bool OnePastTheEnd; + +public: + SubobjectExpr(const Node *Type_, const Node *SubExpr_, StringView Offset_, + NodeArray UnionSelectors_, bool OnePastTheEnd_) + : Node(KSubobjectExpr), Type(Type_), SubExpr(SubExpr_), Offset(Offset_), + UnionSelectors(UnionSelectors_), OnePastTheEnd(OnePastTheEnd_) {} + + template void match(Fn F) const { + F(Type, SubExpr, Offset, UnionSelectors, OnePastTheEnd); + } + + void printLeft(OutputStream &S) const override { + SubExpr->print(S); + S += ".<"; + Type->print(S); + S += " at offset "; + if (Offset.empty()) { + S += "0"; + } else if (Offset[0] == 'n') { + S += "-"; + S += Offset.dropFront(); + } else { + S += Offset; + } + S += ">"; + } +}; + class EnclosingExpr : public Node { 
const StringView Prefix; const Node *Infix; @@ -1843,6 +1878,28 @@ public: } }; +class PointerToMemberConversionExpr : public Node { + const Node *Type; + const Node *SubExpr; + StringView Offset; + +public: + PointerToMemberConversionExpr(const Node *Type_, const Node *SubExpr_, + StringView Offset_) + : Node(KPointerToMemberConversionExpr), Type(Type_), SubExpr(SubExpr_), + Offset(Offset_) {} + + template void match(Fn F) const { F(Type, SubExpr, Offset); } + + void printLeft(OutputStream &S) const override { + S += "("; + Type->print(S); + S += ")("; + SubExpr->print(S); + S += ")"; + } +}; + class InitListExpr : public Node { const Node *Ty; NodeArray Inits; @@ -1977,21 +2034,6 @@ public: } }; -// MSVC __uuidof extension, generated by clang in -fms-extensions mode. -class UUIDOfExpr : public Node { - Node *Operand; -public: - UUIDOfExpr(Node *Operand_) : Node(KUUIDOfExpr), Operand(Operand_) {} - - template void match(Fn F) const { F(Operand); } - - void printLeft(OutputStream &S) const override { - S << "__uuidof("; - Operand->print(S); - S << ")"; - } -}; - class BoolExpr : public Node { bool Value; @@ -2313,9 +2355,9 @@ template struct AbstractManglingParser { TemplateParamList Params; public: - ScopedTemplateParamList(AbstractManglingParser *Parser) - : Parser(Parser), - OldNumTemplateParamLists(Parser->TemplateParams.size()) { + ScopedTemplateParamList(AbstractManglingParser *TheParser) + : Parser(TheParser), + OldNumTemplateParamLists(TheParser->TemplateParams.size()) { Parser->TemplateParams.push_back(&Params); } ~ScopedTemplateParamList() { @@ -2437,6 +2479,8 @@ template struct AbstractManglingParser { Node *parseConversionExpr(); Node *parseBracedExpr(); Node *parseFoldExpr(); + Node *parsePointerToMemberConversionExpr(); + Node *parseSubobjectExpr(); /// Parse the production. 
Node *parseType(); @@ -4404,6 +4448,50 @@ Node *AbstractManglingParser::parseFoldExpr() { return make(IsLeftFold, OperatorName, Pack, Init); } +// ::= mc [] E +// +// Not yet in the spec: https://github.com/itanium-cxx-abi/cxx-abi/issues/47 +template +Node *AbstractManglingParser::parsePointerToMemberConversionExpr() { + Node *Ty = getDerived().parseType(); + if (!Ty) + return nullptr; + Node *Expr = getDerived().parseExpr(); + if (!Expr) + return nullptr; + StringView Offset = getDerived().parseNumber(true); + if (!consumeIf('E')) + return nullptr; + return make(Ty, Expr, Offset); +} + +// ::= so [] * [p] E +// ::= _ [] +// +// Not yet in the spec: https://github.com/itanium-cxx-abi/cxx-abi/issues/47 +template +Node *AbstractManglingParser::parseSubobjectExpr() { + Node *Ty = getDerived().parseType(); + if (!Ty) + return nullptr; + Node *Expr = getDerived().parseExpr(); + if (!Expr) + return nullptr; + StringView Offset = getDerived().parseNumber(true); + size_t SelectorsBegin = Names.size(); + while (consumeIf('_')) { + Node *Selector = make(parseNumber()); + if (!Selector) + return nullptr; + Names.push_back(Selector); + } + bool OnePastTheEnd = consumeIf('p'); + if (!consumeIf('E')) + return nullptr; + return make( + Ty, Expr, Offset, popTrailingNodeArray(SelectorsBegin), OnePastTheEnd); +} + // ::= // ::= // ::= @@ -4661,6 +4749,9 @@ Node *AbstractManglingParser::parseExpr() { return nullptr; case 'm': switch (First[1]) { + case 'c': + First += 2; + return parsePointerToMemberConversionExpr(); case 'i': First += 2; return getDerived().parseBinaryExpr("-"); @@ -4808,6 +4899,9 @@ Node *AbstractManglingParser::parseExpr() { return Ex; return make("static_cast", T, Ex); } + case 'o': + First += 2; + return parseSubobjectExpr(); case 'p': { First += 2; Node *Child = getDerived().parseExpr(); @@ -4903,6 +4997,43 @@ Node *AbstractManglingParser::parseExpr() { } } return nullptr; + case 'u': { + ++First; + Node *Name = 
getDerived().parseSourceName(/*NameState=*/nullptr); + if (!Name) + return nullptr; + // Special case legacy __uuidof mangling. The 't' and 'z' appear where the + // standard encoding expects a , and would be otherwise be + // interpreted as node 'short' or 'ellipsis'. However, neither + // __uuidof(short) nor __uuidof(...) can actually appear, so there is no + // actual conflict here. + if (Name->getBaseName() == "__uuidof") { + if (numLeft() < 2) + return nullptr; + if (*First == 't') { + ++First; + Node *Ty = getDerived().parseType(); + if (!Ty) + return nullptr; + return make(Name, makeNodeArray(&Ty, &Ty + 1)); + } + if (*First == 'z') { + ++First; + Node *Ex = getDerived().parseExpr(); + if (!Ex) + return nullptr; + return make(Name, makeNodeArray(&Ex, &Ex + 1)); + } + } + size_t ExprsBegin = Names.size(); + while (!consumeIf('E')) { + Node *E = getDerived().parseTemplateArg(); + if (E == nullptr) + return E; + Names.push_back(E); + } + return make(Name, popTrailingNodeArray(ExprsBegin)); + } case '1': case '2': case '3': @@ -4914,21 +5045,6 @@ Node *AbstractManglingParser::parseExpr() { case '9': return getDerived().parseUnresolvedName(); } - - if (consumeIf("u8__uuidoft")) { - Node *Ty = getDerived().parseType(); - if (!Ty) - return nullptr; - return make(Ty); - } - - if (consumeIf("u8__uuidofz")) { - Node *Ex = getDerived().parseExpr(); - if (!Ex) - return nullptr; - return make(Ex); - } - return nullptr; } @@ -4975,6 +5091,16 @@ Node *AbstractManglingParser::parseSpecialName() { switch (look()) { case 'T': switch (look(1)) { + // TA # template parameter object + // + // Not yet in the spec: https://github.com/itanium-cxx-abi/cxx-abi/issues/63 + case 'A': { + First += 2; + Node *Arg = getDerived().parseTemplateArg(); + if (Arg == nullptr) + return nullptr; + return make("template parameter object for ", Arg); + } // TV # virtual table case 'V': { First += 2; @@ -5103,7 +5229,7 @@ Node *AbstractManglingParser::parseEncoding() { decltype(TemplateParams) 
OldParams; public: - SaveTemplateParams(AbstractManglingParser *Parser) : Parser(Parser) { + SaveTemplateParams(AbstractManglingParser *TheParser) : Parser(TheParser) { OldParams = std::move(Parser->TemplateParams); Parser->TemplateParams.clear(); } @@ -5203,7 +5329,12 @@ struct FloatData #else static const size_t mangled_size = 20; // May need to be adjusted to 16 or 24 on other platforms #endif - static const size_t max_demangled_size = 40; + // `-0x1.ffffffffffffffffffffffffffffp+16383` + 'L' + '\0' == 42 bytes. + // 28 'f's * 4 bits == 112 bits, which is the number of mantissa bits. + // Negatives are one character longer than positives. + // `0x1.` and `p` are constant, and exponents `+16383` and `-16382` are the + // same length. 1 sign bit, 112 mantissa bits, and 15 exponent bits == 128. + static const size_t max_demangled_size = 42; static constexpr const char *spec = "%LaL"; }; diff --git a/contrib/llvm-project/llvm/include/llvm/Demangle/Utility.h b/contrib/llvm-project/llvm/include/llvm/Demangle/Utility.h index 04e1936ebbe7..846a5f0818e7 100644 --- a/contrib/llvm-project/llvm/include/llvm/Demangle/Utility.h +++ b/contrib/llvm-project/llvm/include/llvm/Demangle/Utility.h @@ -52,7 +52,7 @@ class OutputStream { char *TempPtr = std::end(Temp); while (N) { - *--TempPtr = '0' + char(N % 10); + *--TempPtr = char('0' + N % 10); N /= 10; } diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h index 2562da7cf60b..2e386518f0bf 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h @@ -142,11 +142,6 @@ protected: std::shared_ptr SR, std::unique_ptr TM); - static ExecutionEngine *(*OrcMCJITReplacementCtor)( - std::string *ErrorStr, std::shared_ptr MM, - std::shared_ptr SR, - std::unique_ptr TM); - static ExecutionEngine *(*InterpCtor)(std::unique_ptr M, 
std::string *ErrorStr); @@ -552,7 +547,6 @@ private: std::string MCPU; SmallVector MAttrs; bool VerifyModules; - bool UseOrcMCJITReplacement; bool EmulatedTLS = true; public: @@ -648,17 +642,6 @@ public: return *this; } - // Use OrcMCJITReplacement instead of MCJIT. Off by default. - LLVM_ATTRIBUTE_DEPRECATED( - inline void setUseOrcMCJITReplacement(bool UseOrcMCJITReplacement), - "ORCv1 utilities (including OrcMCJITReplacement) are deprecated. Please " - "use ORCv2/LLJIT instead (see docs/ORCv2.rst)"); - - void setUseOrcMCJITReplacement(ORCv1DeprecationAcknowledgement, - bool UseOrcMCJITReplacement) { - this->UseOrcMCJITReplacement = UseOrcMCJITReplacement; - } - void setEmulatedTLS(bool EmulatedTLS) { this->EmulatedTLS = EmulatedTLS; } @@ -679,10 +662,6 @@ public: ExecutionEngine *create(TargetMachine *TM); }; -void EngineBuilder::setUseOrcMCJITReplacement(bool UseOrcMCJITReplacement) { - this->UseOrcMCJITReplacement = UseOrcMCJITReplacement; -} - // Create wrappers for C Binding types (see CBindingWrapping.h). 
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(ExecutionEngine, LLVMExecutionEngineRef) diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITEventListener.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITEventListener.h index 606b6f7cc128..4eefd993de2b 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITEventListener.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITEventListener.h @@ -20,7 +20,6 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Support/CBindingWrapping.h" #include -#include namespace llvm { diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h index 72687682f606..ec78d9db40b6 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h @@ -21,14 +21,6 @@ namespace llvm { namespace jitlink { -/// Registers all FDEs in the given eh-frame section with the current process. -Error registerEHFrameSection(const void *EHFrameSectionAddr, - size_t EHFrameSectionSize); - -/// Deregisters all FDEs in the given eh-frame section with the current process. -Error deregisterEHFrameSection(const void *EHFrameSectionAddr, - size_t EHFrameSectionSize); - /// Supports registration/deregistration of EH-frames in a target process. class EHFrameRegistrar { public: @@ -42,32 +34,11 @@ public: /// Registers / Deregisters EH-frames in the current process. class InProcessEHFrameRegistrar final : public EHFrameRegistrar { public: - /// Get a reference to the InProcessEHFrameRegistrar singleton. 
- static InProcessEHFrameRegistrar &getInstance(); - - InProcessEHFrameRegistrar(const InProcessEHFrameRegistrar &) = delete; - InProcessEHFrameRegistrar & - operator=(const InProcessEHFrameRegistrar &) = delete; - - InProcessEHFrameRegistrar(InProcessEHFrameRegistrar &&) = delete; - InProcessEHFrameRegistrar &operator=(InProcessEHFrameRegistrar &&) = delete; - Error registerEHFrames(JITTargetAddress EHFrameSectionAddr, - size_t EHFrameSectionSize) override { - return registerEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr), - EHFrameSectionSize); - } + size_t EHFrameSectionSize) override; Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr, - size_t EHFrameSectionSize) override { - return deregisterEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr), - EHFrameSectionSize); - } - -private: - InProcessEHFrameRegistrar(); + size_t EHFrameSectionSize) override; }; using StoreFrameRangeFunction = diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF.h index 9f6ea5271f4b..8912f3a2db45 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF.h @@ -19,11 +19,20 @@ namespace llvm { namespace jitlink { -/// jit-link the given ObjBuffer, which must be a ELF object file. +/// Create a LinkGraph from an ELF relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer); + +/// Link the given graph. /// /// Uses conservative defaults for GOT and stub handling based on the target /// platform. 
-void jitLink_ELF(std::unique_ptr Ctx); +void link_ELF(std::unique_ptr G, + std::unique_ptr Ctx); } // end namespace jitlink } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h index 7860088f3569..1423b0c30b2a 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h @@ -44,8 +44,20 @@ enum ELFX86RelocationKind : Edge::Kind { } // end namespace ELF_x86_64_Edges +/// Create a LinkGraph from an ELF/x86-64 relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromELFObject_x86_64(MemoryBufferRef ObjectBuffer); + /// jit-link the given object buffer, which must be a ELF x86-64 object file. -void jitLink_ELF_x86_64(std::unique_ptr Ctx); +void link_ELF_x86_64(std::unique_ptr G, + std::unique_ptr Ctx); + +/// Return the string name of the given ELF x86-64 edge kind. +StringRef getELFX86RelocationKindName(Edge::Kind R); } // end namespace jitlink } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 76f9dea4160f..e8c0e28b83aa 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -395,6 +395,10 @@ public: return Name; } + /// Rename this symbol. The client is responsible for updating scope and + /// linkage if this name-change requires it. + void setName(StringRef Name) { this->Name = Name; } + /// Returns true if this Symbol has content (potentially) defined within this /// object file (i.e. 
is anything but an external or absolute symbol). bool isDefined() const { @@ -782,21 +786,48 @@ public: Section::const_block_iterator, const Block *, getSectionConstBlocks>; - LinkGraph(std::string Name, unsigned PointerSize, + LinkGraph(std::string Name, const Triple &TT, unsigned PointerSize, support::endianness Endianness) - : Name(std::move(Name)), PointerSize(PointerSize), + : Name(std::move(Name)), TT(TT), PointerSize(PointerSize), Endianness(Endianness) {} /// Returns the name of this graph (usually the name of the original /// underlying MemoryBuffer). const std::string &getName() { return Name; } + /// Returns the target triple for this Graph. + const Triple &getTargetTriple() const { return TT; } + /// Returns the pointer size for use in this graph. unsigned getPointerSize() const { return PointerSize; } /// Returns the endianness of content in this graph. support::endianness getEndianness() const { return Endianness; } + /// Allocate a copy of the given string using the LinkGraph's allocator. + /// This can be useful when renaming symbols or adding new content to the + /// graph. + StringRef allocateString(StringRef Source) { + auto *AllocatedBuffer = Allocator.Allocate(Source.size()); + llvm::copy(Source, AllocatedBuffer); + return StringRef(AllocatedBuffer, Source.size()); + } + + /// Allocate a copy of the given string using the LinkGraph's allocator. + /// This can be useful when renaming symbols or adding new content to the + /// graph. + /// + /// Note: This Twine-based overload requires an extra string copy and an + /// extra heap allocation for large strings. The StringRef overload should + /// be preferred where possible. 
+ StringRef allocateString(Twine Source) { + SmallString<256> TmpBuffer; + auto SourceStr = Source.toStringRef(TmpBuffer); + auto *AllocatedBuffer = Allocator.Allocate(SourceStr.size()); + llvm::copy(SourceStr, AllocatedBuffer); + return StringRef(AllocatedBuffer, SourceStr.size()); + } + /// Create a section with the given name, protection flags, and alignment. Section &createSection(StringRef Name, sys::Memory::ProtectionFlags Prot) { std::unique_ptr
Sec(new Section(Name, Prot, Sections.size())); @@ -959,7 +990,7 @@ public: Section &Sec = Sym.getBlock().getSection(); Sec.removeSymbol(Sym); } - Sym.makeExternal(createAddressable(false)); + Sym.makeExternal(createAddressable(0, false)); ExternalSymbols.insert(&Sym); } @@ -1019,6 +1050,7 @@ private: BumpPtrAllocator Allocator; std::string Name; + Triple TT; unsigned PointerSize; support::endianness Endianness; SectionList Sections; @@ -1191,15 +1223,31 @@ struct PassConfiguration { /// Notable use cases: Building GOT, stub, and TLV symbols. LinkGraphPassList PostPrunePasses; + /// Post-allocation passes. + /// + /// These passes are called on the graph after memory has been allocated and + /// defined nodes have been assigned their final addresses, but before the + /// context has been notified of these addresses. At this point externals + /// have not been resolved, and symbol content has not yet been copied into + /// working memory. + /// + /// Notable use cases: Setting up data structures associated with addresses + /// of defined symbols (e.g. a mapping of __dso_handle to JITDylib* for the + /// JIT runtime) -- using a PostAllocationPass for this ensures that the + /// data structures are in-place before any query for resolved symbols + /// can complete. + LinkGraphPassList PostAllocationPasses; + /// Pre-fixup passes. /// /// These passes are called on the graph after memory has been allocated, - /// content copied into working memory, and nodes have been assigned their - /// final addresses. + /// content copied into working memory, and all nodes (including externals) + /// have been assigned their final addresses, but before any fixups have been + /// applied. /// /// Notable use cases: Late link-time optimizations like GOT and stub /// elimination. - LinkGraphPassList PostAllocationPasses; + LinkGraphPassList PreFixupPasses; /// Post-fixup passes. 
/// @@ -1255,16 +1303,18 @@ class JITLinkContext { public: using LookupMap = DenseMap; + /// Create a JITLinkContext. + JITLinkContext(const JITLinkDylib *JD) : JD(JD) {} + /// Destroy a JITLinkContext. virtual ~JITLinkContext(); + /// Return the JITLinkDylib that this link is targeting, if any. + const JITLinkDylib *getJITLinkDylib() const { return JD; } + /// Return the MemoryManager to be used for this link. virtual JITLinkMemoryManager &getMemoryManager() = 0; - /// Returns a StringRef for the object buffer. - /// This method can not be called once takeObjectBuffer has been called. - virtual MemoryBufferRef getObjectBuffer() const = 0; - /// Notify this context that linking failed. /// Called by JITLink if linking cannot be completed. virtual void notifyFailed(Error Err) = 0; @@ -1279,7 +1329,11 @@ public: /// their final memory locations in the target process. At this point the /// LinkGraph can be inspected to build a symbol table, however the block /// content will not generally have been copied to the target location yet. - virtual void notifyResolved(LinkGraph &G) = 0; + /// + /// If the client detects an error in the LinkGraph state (e.g. unexpected or + /// missing symbols) they may return an error here. The error will be + /// propagated to notifyFailed and the linker will bail out. + virtual Error notifyResolved(LinkGraph &G) = 0; /// Called by JITLink to notify the context that the object has been /// finalized (i.e. emitted to memory and memory permissions set). If all of @@ -1305,16 +1359,25 @@ public: /// Called by JITLink to modify the pass pipeline prior to linking. /// The default version performs no modification. virtual Error modifyPassConfig(const Triple &TT, PassConfiguration &Config); + +private: + const JITLinkDylib *JD = nullptr; }; /// Marks all symbols in a graph live. This can be used as a default, /// conservative mark-live implementation. Error markAllSymbolsLive(LinkGraph &G); -/// Basic JITLink implementation. 
+/// Create a LinkGraph from the given object buffer. /// -/// This function will use sensible defaults for GOT and Stub handling. -void jitLink(std::unique_ptr Ctx); +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromObject(MemoryBufferRef ObjectBuffer); + +/// Link the given graph. +void link(std::unique_ptr G, std::unique_ptr Ctx); } // end namespace jitlink } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkDylib.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkDylib.h new file mode 100644 index 000000000000..2aa88cb50074 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkDylib.h @@ -0,0 +1,24 @@ +//===-- JITLinkDylib.h - JITLink Dylib type ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the JITLinkDylib API. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_JITLINKDYLIB_H +#define LLVM_EXECUTIONENGINE_JITLINK_JITLINKDYLIB_H + +namespace llvm { +namespace jitlink { + +class JITLinkDylib {}; + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_JITLINKDYLIB_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 0c8514a60a50..cee7d6b09c48 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -14,10 +14,11 @@ #define LLVM_EXECUTIONENGINE_JITLINK_JITLINKMEMORYMANAGER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/Support/Error.h" -#include "llvm/Support/Memory.h" #include "llvm/Support/MSVCErrorWorkarounds.h" +#include "llvm/Support/Memory.h" #include #include @@ -93,18 +94,28 @@ public: virtual ~JITLinkMemoryManager(); /// Create an Allocation object. + /// + /// The JD argument represents the target JITLinkDylib, and can be used by + /// JITLinkMemoryManager implementers to manage per-dylib allocation pools + /// (e.g. one pre-reserved address space slab per dylib to ensure that all + /// allocations for the dylib are within a certain range). The JD argument + /// may be null (representing an allocation not associated with any + /// JITDylib. + /// + /// The request argument describes the segment sizes and permisssions being + /// requested. virtual Expected> - allocate(const SegmentsRequestMap &Request) = 0; + allocate(const JITLinkDylib *JD, const SegmentsRequestMap &Request) = 0; }; /// A JITLinkMemoryManager that allocates in-process memory. 
class InProcessMemoryManager : public JITLinkMemoryManager { public: Expected> - allocate(const SegmentsRequestMap &Request) override; + allocate(const JITLinkDylib *JD, const SegmentsRequestMap &Request) override; }; } // end namespace jitlink } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_JITLINK_JITLINK_H +#endif // LLVM_EXECUTIONENGINE_JITLINK_JITLINKMEMORYMANAGER_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO.h index 7facb657a51c..b8432c4d26c6 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO.h @@ -18,11 +18,20 @@ namespace llvm { namespace jitlink { +/// Create a LinkGraph from a MachO relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromMachOObject(MemoryBufferRef ObjectBuffer); + /// jit-link the given ObjBuffer, which must be a MachO object file. /// /// Uses conservative defaults for GOT and stub handling based on the target /// platform. -void jitLink_MachO(std::unique_ptr Ctx); +void link_MachO(std::unique_ptr G, + std::unique_ptr Ctx); } // end namespace jitlink } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h index d70b545fff86..c6aed2b60eac 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h @@ -40,6 +40,14 @@ enum MachOARM64RelocationKind : Edge::Kind { } // namespace MachO_arm64_Edges +/// Create a LinkGraph from a MachO/arm64 relocatable object. 
+/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromMachOObject_arm64(MemoryBufferRef ObjectBuffer); + /// jit-link the given object buffer, which must be a MachO arm64 object file. /// /// If PrePrunePasses is empty then a default mark-live pass will be inserted @@ -49,7 +57,8 @@ enum MachOARM64RelocationKind : Edge::Kind { /// If PostPrunePasses is empty then a default GOT-and-stubs insertion pass will /// be inserted. If PostPrunePasses is not empty then the caller is responsible /// for including a pass to insert GOT and stub edges. -void jitLink_MachO_arm64(std::unique_ptr Ctx); +void link_MachO_arm64(std::unique_ptr G, + std::unique_ptr Ctx); /// Return the string name of the given MachO arm64 edge kind. StringRef getMachOARM64RelocationKindName(Edge::Kind R); diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h index 27fcdf4fa990..66c53d8c8291 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h @@ -45,7 +45,15 @@ enum MachOX86RelocationKind : Edge::Kind { } // namespace MachO_x86_64_Edges -/// jit-link the given object buffer, which must be a MachO x86-64 object file. +/// Create a LinkGraph from a MachO/x86-64 relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromMachOObject_x86_64(MemoryBufferRef ObjectBuffer); + +/// jit-link the given LinkGraph. 
/// /// If PrePrunePasses is empty then a default mark-live pass will be inserted /// that will mark all exported atoms live. If PrePrunePasses is not empty, the @@ -54,7 +62,8 @@ enum MachOX86RelocationKind : Edge::Kind { /// If PostPrunePasses is empty then a default GOT-and-stubs insertion pass will /// be inserted. If PostPrunePasses is not empty then the caller is responsible /// for including a pass to insert GOT and stub edges. -void jitLink_MachO_x86_64(std::unique_ptr Ctx); +void link_MachO_x86_64(std::unique_ptr G, + std::unique_ptr Ctx); /// Return the string name of the given MachO x86-64 edge kind. StringRef getMachOX86RelocationKindName(Edge::Kind R); diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITSymbol.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITSymbol.h index 6f0030a18f47..9bbdd21f77de 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITSymbol.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/JITSymbol.h @@ -429,7 +429,7 @@ public: virtual JITSymbol findSymbol(const std::string &Name) = 0; private: - virtual void anchor(); + void anchor() override; }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 9ecc0464dec1..91b12fd2277a 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -20,12 +20,10 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" -#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/ExecutionEngine/Orc/LazyReexports.h" -#include "llvm/ExecutionEngine/Orc/Legacy.h" -#include "llvm/ExecutionEngine/Orc/OrcError.h" #include 
"llvm/ExecutionEngine/Orc/Speculation.h" +#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" @@ -96,7 +94,8 @@ public: /// Emits the given module. This should not be called by clients: it will be /// called by the JIT when a definition added via the add method is requested. - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; private: struct PerDylibResources { @@ -120,7 +119,8 @@ private: void expandPartition(GlobalValueSet &Partition); - void emitPartition(MaterializationResponsibility R, ThreadSafeModule TSM, + void emitPartition(std::unique_ptr R, + ThreadSafeModule TSM, IRMaterializationUnit::SymbolNameToDefinitionMap Defs); mutable std::mutex CODLayerMutex; @@ -134,635 +134,6 @@ private: ImplSymbolMap *AliaseeImpls = nullptr; }; -/// Compile-on-demand layer. -/// -/// When a module is added to this layer a stub is created for each of its -/// function definitions. The stubs and other global values are immediately -/// added to the layer below. When a stub is called it triggers the extraction -/// of the function body from the original module. The extracted body is then -/// compiled and executed. -template -class LegacyCompileOnDemandLayer { -private: - template - class LambdaMaterializer final : public ValueMaterializer { - public: - LambdaMaterializer(MaterializerFtor M) : M(std::move(M)) {} - - Value *materialize(Value *V) final { return M(V); } - - private: - MaterializerFtor M; - }; - - template - LambdaMaterializer - createLambdaMaterializer(MaterializerFtor M) { - return LambdaMaterializer(std::move(M)); - } - - // Provide type-erasure for the Modules and MemoryManagers. 
- template - class ResourceOwner { - public: - ResourceOwner() = default; - ResourceOwner(const ResourceOwner &) = delete; - ResourceOwner &operator=(const ResourceOwner &) = delete; - virtual ~ResourceOwner() = default; - - virtual ResourceT& getResource() const = 0; - }; - - template - class ResourceOwnerImpl : public ResourceOwner { - public: - ResourceOwnerImpl(ResourcePtrT ResourcePtr) - : ResourcePtr(std::move(ResourcePtr)) {} - - ResourceT& getResource() const override { return *ResourcePtr; } - - private: - ResourcePtrT ResourcePtr; - }; - - template - std::unique_ptr> - wrapOwnership(ResourcePtrT ResourcePtr) { - using RO = ResourceOwnerImpl; - return std::make_unique(std::move(ResourcePtr)); - } - - struct LogicalDylib { - struct SourceModuleEntry { - std::unique_ptr SourceMod; - std::set StubsToClone; - }; - - using SourceModulesList = std::vector; - using SourceModuleHandle = typename SourceModulesList::size_type; - - LogicalDylib() = default; - - LogicalDylib(VModuleKey K, std::shared_ptr BackingResolver, - std::unique_ptr StubsMgr) - : K(std::move(K)), BackingResolver(std::move(BackingResolver)), - StubsMgr(std::move(StubsMgr)) {} - - SourceModuleHandle addSourceModule(std::unique_ptr M) { - SourceModuleHandle H = SourceModules.size(); - SourceModules.push_back(SourceModuleEntry()); - SourceModules.back().SourceMod = std::move(M); - return H; - } - - Module& getSourceModule(SourceModuleHandle H) { - return *SourceModules[H].SourceMod; - } - - std::set& getStubsToClone(SourceModuleHandle H) { - return SourceModules[H].StubsToClone; - } - - JITSymbol findSymbol(BaseLayerT &BaseLayer, const std::string &Name, - bool ExportedSymbolsOnly) { - if (auto Sym = StubsMgr->findStub(Name, ExportedSymbolsOnly)) - return Sym; - for (auto BLK : BaseLayerVModuleKeys) - if (auto Sym = BaseLayer.findSymbolIn(BLK, Name, ExportedSymbolsOnly)) - return Sym; - else if (auto Err = Sym.takeError()) - return std::move(Err); - return nullptr; - } - - Error 
removeModulesFromBaseLayer(BaseLayerT &BaseLayer) { - for (auto &BLK : BaseLayerVModuleKeys) - if (auto Err = BaseLayer.removeModule(BLK)) - return Err; - return Error::success(); - } - - VModuleKey K; - std::shared_ptr BackingResolver; - std::unique_ptr StubsMgr; - SymbolLinkagePromoter PromoteSymbols; - SourceModulesList SourceModules; - std::vector BaseLayerVModuleKeys; - }; - -public: - - /// Module partitioning functor. - using PartitioningFtor = std::function(Function&)>; - - /// Builder for IndirectStubsManagers. - using IndirectStubsManagerBuilderT = - std::function()>; - - using SymbolResolverGetter = - std::function(VModuleKey K)>; - - using SymbolResolverSetter = - std::function R)>; - - /// Construct a compile-on-demand layer instance. - LLVM_ATTRIBUTE_DEPRECATED( - LegacyCompileOnDemandLayer( - ExecutionSession &ES, BaseLayerT &BaseLayer, - SymbolResolverGetter GetSymbolResolver, - SymbolResolverSetter SetSymbolResolver, PartitioningFtor Partition, - CompileCallbackMgrT &CallbackMgr, - IndirectStubsManagerBuilderT CreateIndirectStubsManager, - bool CloneStubsIntoPartitions = true), - "ORCv1 layers (layers with the 'Legacy' prefix) are deprecated. Please " - "use " - "the ORCv2 LegacyCompileOnDemandLayer instead"); - - /// Legacy layer constructor with deprecation acknowledgement. 
- LegacyCompileOnDemandLayer( - ORCv1DeprecationAcknowledgement, ExecutionSession &ES, - BaseLayerT &BaseLayer, SymbolResolverGetter GetSymbolResolver, - SymbolResolverSetter SetSymbolResolver, PartitioningFtor Partition, - CompileCallbackMgrT &CallbackMgr, - IndirectStubsManagerBuilderT CreateIndirectStubsManager, - bool CloneStubsIntoPartitions = true) - : ES(ES), BaseLayer(BaseLayer), - GetSymbolResolver(std::move(GetSymbolResolver)), - SetSymbolResolver(std::move(SetSymbolResolver)), - Partition(std::move(Partition)), CompileCallbackMgr(CallbackMgr), - CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)), - CloneStubsIntoPartitions(CloneStubsIntoPartitions) {} - - ~LegacyCompileOnDemandLayer() { - // FIXME: Report error on log. - while (!LogicalDylibs.empty()) - consumeError(removeModule(LogicalDylibs.begin()->first)); - } - - /// Add a module to the compile-on-demand layer. - Error addModule(VModuleKey K, std::unique_ptr M) { - - assert(!LogicalDylibs.count(K) && "VModuleKey K already in use"); - auto I = LogicalDylibs.insert( - LogicalDylibs.end(), - std::make_pair(K, LogicalDylib(K, GetSymbolResolver(K), - CreateIndirectStubsManager()))); - - return addLogicalModule(I->second, std::move(M)); - } - - /// Add extra modules to an existing logical module. - Error addExtraModule(VModuleKey K, std::unique_ptr M) { - return addLogicalModule(LogicalDylibs[K], std::move(M)); - } - - /// Remove the module represented by the given key. - /// - /// This will remove all modules in the layers below that were derived from - /// the module represented by K. - Error removeModule(VModuleKey K) { - auto I = LogicalDylibs.find(K); - assert(I != LogicalDylibs.end() && "VModuleKey K not valid here"); - auto Err = I->second.removeModulesFromBaseLayer(BaseLayer); - LogicalDylibs.erase(I); - return Err; - } - - /// Search for the given named symbol. - /// @param Name The name of the symbol to search for. 
- /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) { - for (auto &KV : LogicalDylibs) { - if (auto Sym = KV.second.StubsMgr->findStub(Name, ExportedSymbolsOnly)) - return Sym; - if (auto Sym = - findSymbolIn(KV.first, std::string(Name), ExportedSymbolsOnly)) - return Sym; - else if (auto Err = Sym.takeError()) - return std::move(Err); - } - return BaseLayer.findSymbol(std::string(Name), ExportedSymbolsOnly); - } - - /// Get the address of a symbol provided by this layer, or some layer - /// below this one. - JITSymbol findSymbolIn(VModuleKey K, const std::string &Name, - bool ExportedSymbolsOnly) { - assert(LogicalDylibs.count(K) && "VModuleKey K is not valid here"); - return LogicalDylibs[K].findSymbol(BaseLayer, Name, ExportedSymbolsOnly); - } - - /// Update the stub for the given function to point at FnBodyAddr. - /// This can be used to support re-optimization. - /// @return true if the function exists and the stub is updated, false - /// otherwise. - // - // FIXME: We should track and free associated resources (unused compile - // callbacks, uncompiled IR, and no-longer-needed/reachable function - // implementations). 
- Error updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { - //Find out which logical dylib contains our symbol - auto LDI = LogicalDylibs.begin(); - for (auto LDE = LogicalDylibs.end(); LDI != LDE; ++LDI) { - if (auto LMResources = - LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { - Module &SrcM = LMResources->SourceModule->getResource(); - std::string CalledFnName = mangle(FuncName, SrcM.getDataLayout()); - if (auto Err = LMResources->StubsMgr->updatePointer(CalledFnName, - FnBodyAddr)) - return Err; - return Error::success(); - } - } - return make_error(FuncName); - } - -private: - Error addLogicalModule(LogicalDylib &LD, std::unique_ptr SrcMPtr) { - - // Rename anonymous globals and promote linkage to ensure that everything - // will resolve properly after we partition SrcM. - LD.PromoteSymbols(*SrcMPtr); - - // Create a logical module handle for SrcM within the logical dylib. - Module &SrcM = *SrcMPtr; - auto LMId = LD.addSourceModule(std::move(SrcMPtr)); - - // Create stub functions. - const DataLayout &DL = SrcM.getDataLayout(); - - typename IndirectStubsMgrT::StubInitsMap StubInits; - for (auto &F : SrcM) { - // Skip declarations. - if (F.isDeclaration()) - continue; - - // Skip weak functions for which we already have definitions. - auto MangledName = mangle(F.getName(), DL); - if (F.hasWeakLinkage() || F.hasLinkOnceLinkage()) { - if (auto Sym = LD.findSymbol(BaseLayer, MangledName, false)) - continue; - else if (auto Err = Sym.takeError()) - return Err; - } - - // Record all functions defined by this module. - if (CloneStubsIntoPartitions) - LD.getStubsToClone(LMId).insert(&F); - - // Create a callback, associate it with the stub for the function, - // and set the compile action to compile the partition containing the - // function. 
- auto CompileAction = [this, &LD, LMId, &F]() -> JITTargetAddress { - if (auto FnImplAddrOrErr = this->extractAndCompile(LD, LMId, F)) - return *FnImplAddrOrErr; - else { - // FIXME: Report error, return to 'abort' or something similar. - consumeError(FnImplAddrOrErr.takeError()); - return 0; - } - }; - if (auto CCAddr = - CompileCallbackMgr.getCompileCallback(std::move(CompileAction))) - StubInits[MangledName] = - std::make_pair(*CCAddr, JITSymbolFlags::fromGlobalValue(F)); - else - return CCAddr.takeError(); - } - - if (auto Err = LD.StubsMgr->createStubs(StubInits)) - return Err; - - // If this module doesn't contain any globals, aliases, or module flags then - // we can bail out early and avoid the overhead of creating and managing an - // empty globals module. - if (SrcM.global_empty() && SrcM.alias_empty() && - !SrcM.getModuleFlagsMetadata()) - return Error::success(); - - // Create the GlobalValues module. - auto GVsM = std::make_unique((SrcM.getName() + ".globals").str(), - SrcM.getContext()); - GVsM->setDataLayout(DL); - - ValueToValueMapTy VMap; - - // Clone global variable decls. - for (auto &GV : SrcM.globals()) - if (!GV.isDeclaration() && !VMap.count(&GV)) - cloneGlobalVariableDecl(*GVsM, GV, &VMap); - - // And the aliases. - for (auto &A : SrcM.aliases()) - if (!VMap.count(&A)) - cloneGlobalAliasDecl(*GVsM, A, VMap); - - // Clone the module flags. - cloneModuleFlagsMetadata(*GVsM, SrcM, VMap); - - // Now we need to clone the GV and alias initializers. - - // Initializers may refer to functions declared (but not defined) in this - // module. Build a materializer to clone decls on demand. - auto Materializer = createLambdaMaterializer( - [&LD, &GVsM](Value *V) -> Value* { - if (auto *F = dyn_cast(V)) { - // Decls in the original module just get cloned. 
- if (F->isDeclaration()) - return cloneFunctionDecl(*GVsM, *F); - - // Definitions in the original module (which we have emitted stubs - // for at this point) get turned into a constant alias to the stub - // instead. - const DataLayout &DL = GVsM->getDataLayout(); - std::string FName = mangle(F->getName(), DL); - unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(F->getType()); - JITTargetAddress StubAddr = - LD.StubsMgr->findStub(FName, false).getAddress(); - - ConstantInt *StubAddrCI = - ConstantInt::get(GVsM->getContext(), APInt(PtrBitWidth, StubAddr)); - Constant *Init = ConstantExpr::getCast(Instruction::IntToPtr, - StubAddrCI, F->getType()); - return GlobalAlias::create(F->getFunctionType(), - F->getType()->getAddressSpace(), - F->getLinkage(), F->getName(), - Init, GVsM.get()); - } - // else.... - return nullptr; - }); - - // Clone the global variable initializers. - for (auto &GV : SrcM.globals()) - if (!GV.isDeclaration()) - moveGlobalVariableInitializer(GV, VMap, &Materializer); - - // Clone the global alias initializers. - for (auto &A : SrcM.aliases()) { - auto *NewA = cast(VMap[&A]); - assert(NewA && "Alias not cloned?"); - Value *Init = MapValue(A.getAliasee(), VMap, RF_None, nullptr, - &Materializer); - NewA->setAliasee(cast(Init)); - } - - // Build a resolver for the globals module and add it to the base layer. 
- auto LegacyLookup = [this, &LD](StringRef Name) -> JITSymbol { - if (auto Sym = LD.StubsMgr->findStub(Name, false)) - return Sym; - - if (auto Sym = LD.findSymbol(BaseLayer, std::string(Name), false)) - return Sym; - else if (auto Err = Sym.takeError()) - return std::move(Err); - - return nullptr; - }; - - auto GVsResolver = createSymbolResolver( - [&LD, LegacyLookup](const SymbolNameSet &Symbols) { - auto RS = getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup); - - if (!RS) { - logAllUnhandledErrors( - RS.takeError(), errs(), - "CODLayer/GVsResolver responsibility set lookup failed: "); - return SymbolNameSet(); - } - - if (RS->size() == Symbols.size()) - return *RS; - - SymbolNameSet NotFoundViaLegacyLookup; - for (auto &S : Symbols) - if (!RS->count(S)) - NotFoundViaLegacyLookup.insert(S); - auto RS2 = - LD.BackingResolver->getResponsibilitySet(NotFoundViaLegacyLookup); - - for (auto &S : RS2) - (*RS).insert(S); - - return *RS; - }, - [this, &LD, - LegacyLookup](std::shared_ptr Query, - SymbolNameSet Symbols) { - auto NotFoundViaLegacyLookup = - lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup); - return LD.BackingResolver->lookup(Query, NotFoundViaLegacyLookup); - }); - - SetSymbolResolver(LD.K, std::move(GVsResolver)); - - if (auto Err = BaseLayer.addModule(LD.K, std::move(GVsM))) - return Err; - - LD.BaseLayerVModuleKeys.push_back(LD.K); - - return Error::success(); - } - - static std::string mangle(StringRef Name, const DataLayout &DL) { - std::string MangledName; - { - raw_string_ostream MangledNameStream(MangledName); - Mangler::getNameWithPrefix(MangledNameStream, Name, DL); - } - return MangledName; - } - - Expected - extractAndCompile(LogicalDylib &LD, - typename LogicalDylib::SourceModuleHandle LMId, - Function &F) { - Module &SrcM = LD.getSourceModule(LMId); - - // If F is a declaration we must already have compiled it. - if (F.isDeclaration()) - return 0; - - // Grab the name of the function being called here. 
- std::string CalledFnName = mangle(F.getName(), SrcM.getDataLayout()); - - JITTargetAddress CalledAddr = 0; - auto Part = Partition(F); - if (auto PartKeyOrErr = emitPartition(LD, LMId, Part)) { - auto &PartKey = *PartKeyOrErr; - for (auto *SubF : Part) { - std::string FnName = mangle(SubF->getName(), SrcM.getDataLayout()); - if (auto FnBodySym = BaseLayer.findSymbolIn(PartKey, FnName, false)) { - if (auto FnBodyAddrOrErr = FnBodySym.getAddress()) { - JITTargetAddress FnBodyAddr = *FnBodyAddrOrErr; - - // If this is the function we're calling record the address so we can - // return it from this function. - if (SubF == &F) - CalledAddr = FnBodyAddr; - - // Update the function body pointer for the stub. - if (auto EC = LD.StubsMgr->updatePointer(FnName, FnBodyAddr)) - return 0; - - } else - return FnBodyAddrOrErr.takeError(); - } else if (auto Err = FnBodySym.takeError()) - return std::move(Err); - else - llvm_unreachable("Function not emitted for partition"); - } - - LD.BaseLayerVModuleKeys.push_back(PartKey); - } else - return PartKeyOrErr.takeError(); - - return CalledAddr; - } - - template - Expected - emitPartition(LogicalDylib &LD, - typename LogicalDylib::SourceModuleHandle LMId, - const PartitionT &Part) { - Module &SrcM = LD.getSourceModule(LMId); - - // Create the module. - std::string NewName(SrcM.getName()); - for (auto *F : Part) { - NewName += "."; - NewName += F->getName(); - } - - auto M = std::make_unique(NewName, SrcM.getContext()); - M->setDataLayout(SrcM.getDataLayout()); - ValueToValueMapTy VMap; - - auto Materializer = createLambdaMaterializer([&LD, &LMId, - &M](Value *V) -> Value * { - if (auto *GV = dyn_cast(V)) - return cloneGlobalVariableDecl(*M, *GV); - - if (auto *F = dyn_cast(V)) { - // Check whether we want to clone an available_externally definition. - if (!LD.getStubsToClone(LMId).count(F)) - return cloneFunctionDecl(*M, *F); - - // Ok - we want an inlinable stub. For that to work we need a decl - // for the stub pointer. 
- auto *StubPtr = createImplPointer(*F->getType(), *M, - F->getName() + "$stub_ptr", nullptr); - auto *ClonedF = cloneFunctionDecl(*M, *F); - makeStub(*ClonedF, *StubPtr); - ClonedF->setLinkage(GlobalValue::AvailableExternallyLinkage); - ClonedF->addFnAttr(Attribute::AlwaysInline); - return ClonedF; - } - - if (auto *A = dyn_cast(V)) { - auto *Ty = A->getValueType(); - if (Ty->isFunctionTy()) - return Function::Create(cast(Ty), - GlobalValue::ExternalLinkage, A->getName(), - M.get()); - - return new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, - nullptr, A->getName(), nullptr, - GlobalValue::NotThreadLocal, - A->getType()->getAddressSpace()); - } - - return nullptr; - }); - - // Create decls in the new module. - for (auto *F : Part) - cloneFunctionDecl(*M, *F, &VMap); - - // Move the function bodies. - for (auto *F : Part) - moveFunctionBody(*F, VMap, &Materializer); - - auto K = ES.allocateVModule(); - - auto LegacyLookup = [this, &LD](StringRef Name) -> JITSymbol { - return LD.findSymbol(BaseLayer, std::string(Name), false); - }; - - // Create memory manager and symbol resolver. 
- auto Resolver = createSymbolResolver( - [&LD, LegacyLookup](const SymbolNameSet &Symbols) { - auto RS = getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup); - if (!RS) { - logAllUnhandledErrors( - RS.takeError(), errs(), - "CODLayer/SubResolver responsibility set lookup failed: "); - return SymbolNameSet(); - } - - if (RS->size() == Symbols.size()) - return *RS; - - SymbolNameSet NotFoundViaLegacyLookup; - for (auto &S : Symbols) - if (!RS->count(S)) - NotFoundViaLegacyLookup.insert(S); - - auto RS2 = - LD.BackingResolver->getResponsibilitySet(NotFoundViaLegacyLookup); - - for (auto &S : RS2) - (*RS).insert(S); - - return *RS; - }, - [this, &LD, LegacyLookup](std::shared_ptr Q, - SymbolNameSet Symbols) { - auto NotFoundViaLegacyLookup = - lookupWithLegacyFn(ES, *Q, Symbols, LegacyLookup); - return LD.BackingResolver->lookup(Q, - std::move(NotFoundViaLegacyLookup)); - }); - SetSymbolResolver(K, std::move(Resolver)); - - if (auto Err = BaseLayer.addModule(std::move(K), std::move(M))) - return std::move(Err); - - return K; - } - - ExecutionSession &ES; - BaseLayerT &BaseLayer; - SymbolResolverGetter GetSymbolResolver; - SymbolResolverSetter SetSymbolResolver; - PartitioningFtor Partition; - CompileCallbackMgrT &CompileCallbackMgr; - IndirectStubsManagerBuilderT CreateIndirectStubsManager; - - std::map LogicalDylibs; - bool CloneStubsIntoPartitions; -}; - -template -LegacyCompileOnDemandLayer:: - LegacyCompileOnDemandLayer( - ExecutionSession &ES, BaseLayerT &BaseLayer, - SymbolResolverGetter GetSymbolResolver, - SymbolResolverSetter SetSymbolResolver, PartitioningFtor Partition, - CompileCallbackMgrT &CallbackMgr, - IndirectStubsManagerBuilderT CreateIndirectStubsManager, - bool CloneStubsIntoPartitions) - : ES(ES), BaseLayer(BaseLayer), - GetSymbolResolver(std::move(GetSymbolResolver)), - SetSymbolResolver(std::move(SetSymbolResolver)), - Partition(std::move(Partition)), CompileCallbackMgr(CallbackMgr), - 
CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)), - CloneStubsIntoPartitions(CloneStubsIntoPartitions) {} - } // end namespace orc } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h index 8376d163d57a..c7ba57228ab7 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/CompileUtils.h @@ -28,8 +28,6 @@ class TargetMachine; namespace orc { -class JITTargetMachineBuilder; - IRSymbolMapper::ManglingOptions irManglingOptionsFromTargetOptions(const TargetOptions &Opts); diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Core.h index a117acefd2d3..4a4b58ed32e3 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -16,11 +16,14 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" #include "llvm/ExecutionEngine/OrcV1Deprecation.h" #include "llvm/Support/Debug.h" +#include #include #include @@ -33,11 +36,67 @@ class ExecutionSession; class MaterializationUnit; class MaterializationResponsibility; class JITDylib; +class ResourceTracker; +class InProgressLookupState; + enum class SymbolState : uint8_t; -/// VModuleKey provides a unique identifier (allocated and managed by -/// ExecutionSessions) for a module added to the JIT. 
-using VModuleKey = uint64_t; +using ResourceTrackerSP = IntrusiveRefCntPtr; +using JITDylibSP = IntrusiveRefCntPtr; + +using ResourceKey = uintptr_t; + +/// API to remove / transfer ownership of JIT resources. +class ResourceTracker : public ThreadSafeRefCountedBase { +private: + friend class ExecutionSession; + friend class JITDylib; + friend class MaterializationResponsibility; + +public: + ResourceTracker(const ResourceTracker &) = delete; + ResourceTracker &operator=(const ResourceTracker &) = delete; + ResourceTracker(ResourceTracker &&) = delete; + ResourceTracker &operator=(ResourceTracker &&) = delete; + + ~ResourceTracker(); + + /// Return the JITDylib targeted by this tracker. + JITDylib &getJITDylib() const { + return *reinterpret_cast(JDAndFlag.load() & + ~static_cast(1)); + } + + /// Remove all resources associated with this key. + Error remove(); + + /// Transfer all resources associated with this key to the given + /// tracker, which must target the same JITDylib as this one. + void transferTo(ResourceTracker &DstRT); + + /// Return true if this tracker has become defunct. + bool isDefunct() const { return JDAndFlag.load() & 0x1; } + + /// Returns the key associated with this tracker. + /// This method should not be used except for debug logging: there is no + /// guarantee that the returned value will remain valid. + ResourceKey getKeyUnsafe() const { return reinterpret_cast(this); } + +private: + ResourceTracker(JITDylibSP JD); + + void makeDefunct(); + + std::atomic_uintptr_t JDAndFlag; +}; + +/// Listens for ResourceTracker operations. +class ResourceManager { +public: + virtual ~ResourceManager(); + virtual Error handleRemoveResources(ResourceKey K) = 0; + virtual void handleTransferResources(ResourceKey DstK, ResourceKey SrcK) = 0; +}; /// A set of symbol names (represented by SymbolStringPtrs for // efficiency). @@ -158,9 +217,19 @@ public: /// Add an element to the set. 
The client is responsible for checking that /// duplicates are not added. - void add(SymbolStringPtr Name, - SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + SymbolLookupSet & + add(SymbolStringPtr Name, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { Symbols.push_back(std::make_pair(std::move(Name), Flags)); + return *this; + } + + /// Quickly append one lookup set to another. + SymbolLookupSet &append(SymbolLookupSet Other) { + Symbols.reserve(Symbols.size() + Other.size()); + for (auto &KV : Other) + Symbols.push_back(std::move(KV)); + return *this; } bool empty() const { return Symbols.empty(); } @@ -287,7 +356,7 @@ public: for (UnderlyingVector::size_type I = 1; I != Symbols.size(); ++I) if (Symbols[I].first == Symbols[I - 1].first) return true; - return true; + return false; } #endif @@ -318,6 +387,18 @@ using RegisterDependenciesFunction = /// are no dependants to register with. extern RegisterDependenciesFunction NoDependenciesToRegister; +class ResourceTrackerDefunct : public ErrorInfo { +public: + static char ID; + + ResourceTrackerDefunct(ResourceTrackerSP RT); + std::error_code convertToErrorCode() const override; + void log(raw_ostream &OS) const override; + +private: + ResourceTrackerSP RT; +}; + /// Used to notify a JITDylib that the given set of symbols failed to /// materialize. class FailedToMaterialize : public ErrorInfo { @@ -408,9 +489,10 @@ private: /// emit symbols, or abandon materialization by notifying any unmaterialized /// symbols of an error. class MaterializationResponsibility { - friend class MaterializationUnit; + friend class ExecutionSession; + public: - MaterializationResponsibility(MaterializationResponsibility &&) = default; + MaterializationResponsibility(MaterializationResponsibility &&) = delete; MaterializationResponsibility & operator=(MaterializationResponsibility &&) = delete; @@ -419,12 +501,15 @@ public: /// emitted or notified of an error. 
~MaterializationResponsibility(); + /// Returns the ResourceTracker for this instance. + template Error withResourceKeyDo(Func &&F) const; + /// Returns the target JITDylib that these symbols are being materialized /// into. JITDylib &getTargetJITDylib() const { return *JD; } - /// Returns the VModuleKey for this instance. - VModuleKey getVModuleKey() const { return K; } + /// Returns the ExecutionSession for this instance. + ExecutionSession &getExecutionSession(); /// Returns the symbol flags map for this responsibility instance. /// Note: The returned flags may have transient flags (Lazy, Materializing) @@ -509,13 +594,13 @@ public: /// materializers to break up work based on run-time information (e.g. /// by introspecting which symbols have actually been looked up and /// materializing only those). - void replace(std::unique_ptr MU); + Error replace(std::unique_ptr MU); /// Delegates responsibility for the given symbols to the returned /// materialization responsibility. Useful for breaking up work between /// threads, or different kinds of materialization processes. - MaterializationResponsibility delegate(const SymbolNameSet &Symbols, - VModuleKey NewKey = VModuleKey()); + Expected> + delegate(const SymbolNameSet &Symbols); void addDependencies(const SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies); @@ -526,19 +611,17 @@ public: private: /// Create a MaterializationResponsibility for the given JITDylib and /// initial symbols. 
- MaterializationResponsibility(std::shared_ptr JD, - SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol, VModuleKey K) + MaterializationResponsibility(JITDylibSP JD, SymbolFlagsMap SymbolFlags, + SymbolStringPtr InitSymbol) : JD(std::move(JD)), SymbolFlags(std::move(SymbolFlags)), - InitSymbol(std::move(InitSymbol)), K(std::move(K)) { - assert(this->JD && "Cannot initialize with null JD"); + InitSymbol(std::move(InitSymbol)) { + assert(this->JD && "Cannot initialize with null JITDylib"); assert(!this->SymbolFlags.empty() && "Materializing nothing?"); } - std::shared_ptr JD; + JITDylibSP JD; SymbolFlagsMap SymbolFlags; SymbolStringPtr InitSymbol; - VModuleKey K; }; /// A MaterializationUnit represents a set of symbol definitions that can @@ -555,9 +638,9 @@ class MaterializationUnit { public: MaterializationUnit(SymbolFlagsMap InitalSymbolFlags, - SymbolStringPtr InitSymbol, VModuleKey K) + SymbolStringPtr InitSymbol) : SymbolFlags(std::move(InitalSymbolFlags)), - InitSymbol(std::move(InitSymbol)), K(std::move(K)) { + InitSymbol(std::move(InitSymbol)) { assert((!this->InitSymbol || this->SymbolFlags.count(this->InitSymbol)) && "If set, InitSymbol should appear in InitialSymbolFlags map"); } @@ -577,7 +660,8 @@ public: /// Implementations of this method should materialize all symbols /// in the materialzation unit, except for those that have been /// previously discarded. - virtual void materialize(MaterializationResponsibility R) = 0; + virtual void + materialize(std::unique_ptr R) = 0; /// Called by JITDylibs to notify MaterializationUnits that the given symbol /// has been overridden. 
@@ -589,17 +673,10 @@ public: protected: SymbolFlagsMap SymbolFlags; SymbolStringPtr InitSymbol; - VModuleKey K; private: virtual void anchor(); - MaterializationResponsibility - createMaterializationResponsibility(std::shared_ptr JD) { - return MaterializationResponsibility(std::move(JD), std::move(SymbolFlags), - std::move(InitSymbol), K); - } - /// Implementations of this method should discard the given symbol /// from the source (e.g. if the source is an LLVM IR Module and the /// symbol is a function, delete the function body or mark it available @@ -607,21 +684,18 @@ private: virtual void discard(const JITDylib &JD, const SymbolStringPtr &Name) = 0; }; -using MaterializationUnitList = - std::vector>; - /// A MaterializationUnit implementation for pre-existing absolute symbols. /// /// All symbols will be resolved and marked ready as soon as the unit is /// materialized. class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit { public: - AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols, VModuleKey K); + AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols); StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolMap &Symbols); @@ -639,9 +713,9 @@ private: /// \endcode /// inline std::unique_ptr -absoluteSymbols(SymbolMap Symbols, VModuleKey K = VModuleKey()) { +absoluteSymbols(SymbolMap Symbols) { return std::make_unique( - std::move(Symbols), std::move(K)); + std::move(Symbols)); } /// A materialization unit for symbol aliases. Allows existing symbols to be @@ -658,12 +732,12 @@ public: /// resolved. 
ReExportsMaterializationUnit(JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags, - SymbolAliasMap Aliases, VModuleKey K); + SymbolAliasMap Aliases); StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases); @@ -684,10 +758,9 @@ private: /// return Err; /// \endcode inline std::unique_ptr -symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) { +symbolAliases(SymbolAliasMap Aliases) { return std::make_unique( - nullptr, JITDylibLookupFlags::MatchAllSymbols, std::move(Aliases), - std::move(K)); + nullptr, JITDylibLookupFlags::MatchAllSymbols, std::move(Aliases)); } /// Create a materialization unit for re-exporting symbols from another JITDylib @@ -696,10 +769,9 @@ symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) { inline std::unique_ptr reexports(JITDylib &SourceJD, SymbolAliasMap Aliases, JITDylibLookupFlags SourceJDLookupFlags = - JITDylibLookupFlags::MatchExportedSymbolsOnly, - VModuleKey K = VModuleKey()) { + JITDylibLookupFlags::MatchExportedSymbolsOnly) { return std::make_unique( - &SourceJD, SourceJDLookupFlags, std::move(Aliases), std::move(K)); + &SourceJD, SourceJDLookupFlags, std::move(Aliases)); } /// Build a SymbolAliasMap for the common case where you want to re-export @@ -723,8 +795,10 @@ enum class SymbolState : uint8_t { /// makes a callback when all symbols are available. class AsynchronousSymbolQuery { friend class ExecutionSession; + friend class InProgressFullLookupState; friend class JITDylib; friend class JITSymbolResolverAdapter; + friend class MaterializationResponsibility; public: /// Create a query for the given symbols. 
The NotifyComplete @@ -757,8 +831,6 @@ private: void dropSymbol(const SymbolStringPtr &Name); - bool canStillFail(); - void handleFailed(Error Err); void detach(); @@ -770,34 +842,62 @@ private: SymbolState RequiredState; }; +/// Wraps state for a lookup-in-progress. +/// DefinitionGenerators can optionally take ownership of a LookupState object +/// to suspend a lookup-in-progress while they search for definitions. +class LookupState { + friend class OrcV2CAPIHelper; + friend class ExecutionSession; + +public: + LookupState(); + LookupState(LookupState &&); + LookupState &operator=(LookupState &&); + ~LookupState(); + + /// Continue the lookup. This can be called by DefinitionGenerators + /// to re-start a captured query-application operation. + void continueLookup(Error Err); + +private: + LookupState(std::unique_ptr IPLS); + + // For C API. + void reset(InProgressLookupState *IPLS); + + std::unique_ptr IPLS; +}; + +/// Definition generators can be attached to JITDylibs to generate new +/// definitions for otherwise unresolved symbols during lookup. +class DefinitionGenerator { +public: + virtual ~DefinitionGenerator(); + + /// DefinitionGenerators should override this method to insert new + /// definitions into the parent JITDylib. K specifies the kind of this + /// lookup. JD specifies the target JITDylib being searched, and + /// JDLookupFlags specifies whether the search should match against + /// hidden symbols. Finally, Symbols describes the set of unresolved + /// symbols and their associated lookup flags. + virtual Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &LookupSet) = 0; +}; + /// A symbol table that supports asynchoronous symbol queries. /// /// Represents a virtual shared object. Instances can not be copied or moved, so /// their addresses may be used as keys for resource management. 
/// JITDylib state changes must be made via an ExecutionSession to guarantee /// that they are synchronized with respect to other JITDylib operations. -class JITDylib : public std::enable_shared_from_this { +class JITDylib : public ThreadSafeRefCountedBase, + public jitlink::JITLinkDylib { friend class AsynchronousSymbolQuery; friend class ExecutionSession; friend class Platform; friend class MaterializationResponsibility; public: - /// Definition generators can be attached to JITDylibs to generate new - /// definitions for otherwise unresolved symbols during lookup. - class DefinitionGenerator { - public: - virtual ~DefinitionGenerator(); - - /// DefinitionGenerators should override this method to insert new - /// definitions into the parent JITDylib. K specifies the kind of this - /// lookup. JD specifies the target JITDylib being searched, and - /// JDLookupFlags specifies whether the search should match against - /// hidden symbols. Finally, Symbols describes the set of unresolved - /// symbols and their associated lookup flags. - virtual Error tryToGenerate(LookupKind K, JITDylib &JD, - JITDylibLookupFlags JDLookupFlags, - const SymbolLookupSet &LookupSet) = 0; - }; using AsynchronousSymbolQuerySet = std::set>; @@ -813,6 +913,21 @@ public: /// Get a reference to the ExecutionSession for this JITDylib. ExecutionSession &getExecutionSession() const { return ES; } + /// Calls remove on all trackers currently associated with this JITDylib. + /// Does not run static deinits. + /// + /// Note that removal happens outside the session lock, so new code may be + /// added concurrently while the clear is underway, and the newly added + /// code will *not* be cleared. Adding new code concurrently with a clear + /// is usually a bug and should be avoided. + Error clear(); + + /// Get the default resource tracker for this JITDylib. + ResourceTrackerSP getDefaultResourceTracker(); + + /// Create a resource tracker for this JITDylib. 
+ ResourceTrackerSP createResourceTracker(); + /// Adds a definition generator to this JITDylib and returns a referenece to /// it. /// @@ -873,10 +988,13 @@ public: /// Define all symbols provided by the materialization unit to be part of this /// JITDylib. /// + /// If RT is not specified then the default resource tracker will be used. + /// /// This overload always takes ownership of the MaterializationUnit. If any /// errors occur, the MaterializationUnit consumed. template - Error define(std::unique_ptr &&MU); + Error define(std::unique_ptr &&MU, + ResourceTrackerSP RT = nullptr); /// Define all symbols provided by the materialization unit to be part of this /// JITDylib. @@ -886,7 +1004,8 @@ public: /// may allow the caller to modify the MaterializationUnit to correct the /// issue, then re-call define. template - Error define(std::unique_ptr &MU); + Error define(std::unique_ptr &MU, + ResourceTrackerSP RT = nullptr); /// Tries to remove the given symbols. /// @@ -900,41 +1019,47 @@ public: /// left unmodified (no symbols are removed). Error remove(const SymbolNameSet &Names); - /// Search the given JITDylib for the symbols in Symbols. If found, store - /// the flags for each symbol in Flags. If any required symbols are not found - /// then an error will be returned. - Expected lookupFlags(LookupKind K, - JITDylibLookupFlags JDLookupFlags, - SymbolLookupSet LookupSet); - /// Dump current JITDylib state to OS. void dump(raw_ostream &OS); - /// FIXME: Remove this when we remove the old ORC layers. - /// Search the given JITDylibs in order for the symbols in Symbols. Results - /// (once they become available) will be returned via the given Query. - /// - /// If any symbol is not found then the unresolved symbols will be returned, - /// and the query will not be applied. The Query is not failed and can be - /// re-used in a subsequent lookup once the symbols have been added, or - /// manually failed. 
- Expected - legacyLookup(std::shared_ptr Q, SymbolNameSet Names); + /// Returns the given JITDylibs and all of their transitive dependencies in + /// DFS order (based on linkage relationships). Each JITDylib will appear + /// only once. + static std::vector getDFSLinkOrder(ArrayRef JDs); + + /// Returns the given JITDylibs and all of their transitive dependensies in + /// reverse DFS order (based on linkage relationships). Each JITDylib will + /// appear only once. + static std::vector + getReverseDFSLinkOrder(ArrayRef JDs); + + /// Return this JITDylib and its transitive dependencies in DFS order + /// based on linkage relationships. + std::vector getDFSLinkOrder(); + + /// Rteurn this JITDylib and its transitive dependencies in reverse DFS order + /// based on linkage relationships. + std::vector getReverseDFSLinkOrder(); private: using AsynchronousSymbolQueryList = std::vector>; struct UnmaterializedInfo { - UnmaterializedInfo(std::unique_ptr MU) - : MU(std::move(MU)) {} + UnmaterializedInfo(std::unique_ptr MU, + ResourceTracker *RT) + : MU(std::move(MU)), RT(RT) {} std::unique_ptr MU; + ResourceTracker *RT; }; using UnmaterializedInfosMap = DenseMap>; + using UnmaterializedInfosList = + std::vector>; + struct MaterializingInfo { SymbolDependenceMap Dependants; SymbolDependenceMap UnemittedDependencies; @@ -1001,25 +1126,16 @@ private: JITDylib(ExecutionSession &ES, std::string Name); - Error defineImpl(MaterializationUnit &MU); - - void lookupFlagsImpl(SymbolFlagsMap &Result, LookupKind K, - JITDylibLookupFlags JDLookupFlags, - SymbolLookupSet &Unresolved); + ResourceTrackerSP getTracker(MaterializationResponsibility &MR); + std::pair> + removeTracker(ResourceTracker &RT); - Error lodgeQuery(MaterializationUnitList &MUs, - std::shared_ptr &Q, LookupKind K, - JITDylibLookupFlags JDLookupFlags, - SymbolLookupSet &Unresolved); + void transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT); - Error lodgeQueryImpl(MaterializationUnitList &MUs, - 
std::shared_ptr &Q, - LookupKind K, JITDylibLookupFlags JDLookupFlags, - SymbolLookupSet &Unresolved); + Error defineImpl(MaterializationUnit &MU); - bool lookupImpl(std::shared_ptr &Q, - std::vector> &MUs, - SymbolLookupSet &Unresolved); + void installMaterializationUnit(std::unique_ptr MU, + ResourceTracker &RT); void detachQueryHelper(AsynchronousSymbolQuery &Q, const SymbolNameSet &QuerySymbols); @@ -1030,29 +1146,45 @@ private: Expected defineMaterializing(SymbolFlagsMap SymbolFlags); - void replace(std::unique_ptr MU); + Error replace(MaterializationResponsibility &FromMR, + std::unique_ptr MU); + + Expected> + delegate(MaterializationResponsibility &FromMR, SymbolFlagsMap SymbolFlags, + SymbolStringPtr InitSymbol); SymbolNameSet getRequestedSymbols(const SymbolFlagsMap &SymbolFlags) const; void addDependencies(const SymbolStringPtr &Name, const SymbolDependenceMap &Dependants); - Error resolve(const SymbolMap &Resolved); + Error resolve(MaterializationResponsibility &MR, const SymbolMap &Resolved); + + Error emit(MaterializationResponsibility &MR, const SymbolFlagsMap &Emitted); - Error emit(const SymbolFlagsMap &Emitted); + void unlinkMaterializationResponsibility(MaterializationResponsibility &MR); using FailedSymbolsWorklist = std::vector>; - static void notifyFailed(FailedSymbolsWorklist FailedSymbols); + + static std::pair> + failSymbols(FailedSymbolsWorklist); ExecutionSession &ES; std::string JITDylibName; + std::mutex GeneratorsMutex; bool Open = true; SymbolTable Symbols; UnmaterializedInfosMap UnmaterializedInfos; MaterializingInfosMap MaterializingInfos; - std::vector> DefGenerators; + std::vector> DefGenerators; JITDylibSearchOrder LinkOrder; + ResourceTrackerSP DefaultTracker; + + // Map trackers to sets of symbols tracked. 
+ DenseMap TrackerSymbols; + DenseMap MRTrackers; }; /// Platforms set up standard symbols and mediate interactions between dynamic @@ -1071,11 +1203,12 @@ public: /// This method will be called under the ExecutionSession lock each time a /// MaterializationUnit is added to a JITDylib. - virtual Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) = 0; + virtual Error notifyAdding(ResourceTracker &RT, + const MaterializationUnit &MU) = 0; /// This method will be called under the ExecutionSession lock when a - /// VModuleKey is removed. - virtual Error notifyRemoving(JITDylib &JD, VModuleKey K) = 0; + /// ResourceTracker is removed. + virtual Error notifyRemoving(ResourceTracker &RT) = 0; /// A utility function for looking up initializer symbols. Performs a blocking /// lookup for the given symbols in each of the given JITDylibs. @@ -1086,8 +1219,12 @@ public: /// An ExecutionSession represents a running JIT program. class ExecutionSession { - // FIXME: Remove this when we remove the old ORC layers. + friend class InProgressLookupFlagsState; + friend class InProgressFullLookupState; friend class JITDylib; + friend class LookupState; + friend class MaterializationResponsibility; + friend class ResourceTracker; public: /// For reporting errors. @@ -1096,13 +1233,16 @@ public: /// For dispatching MaterializationUnit::materialize calls. using DispatchMaterializationFunction = std::function MU, - MaterializationResponsibility MR)>; + std::unique_ptr MR)>; /// Construct an ExecutionSession. /// /// SymbolStringPools may be shared between ExecutionSessions. ExecutionSession(std::shared_ptr SSP = nullptr); + /// End the session. Closes all JITDylibs. + Error endSession(); + /// Add a symbol name to the SymbolStringPool and return a pointer to it. SymbolStringPtr intern(StringRef SymName) { return SSP->intern(SymName); } @@ -1122,6 +1262,14 @@ public: return F(); } + /// Register the given ResourceManager with this ExecutionSession. 
+ /// Managers will be notified of events in reverse order of registration. + void registerResourceManager(ResourceManager &RM); + + /// Deregister the given ResourceManager with this ExecutionSession. + /// Manager must have been previously registered. + void deregisterResourceManager(ResourceManager &RM); + /// Return a pointer to the "name" JITDylib. /// Ownership of JITDylib remains within Execution Session JITDylib *getJITDylibByName(StringRef Name); @@ -1147,17 +1295,6 @@ public: /// If no Platform is attached this call is equivalent to createBareJITDylib. Expected createJITDylib(std::string Name); - /// Allocate a module key for a new module to add to the JIT. - VModuleKey allocateVModule() { - return runSessionLocked([this]() { return ++LastKey; }); - } - - /// Return a module key to the ExecutionSession so that it can be - /// re-used. This should only be done once all resources associated - /// with the original key have been released. - void releaseVModule(VModuleKey Key) { /* FIXME: Recycle keys */ - } - /// Set the error reporter function. ExecutionSession &setErrorReporter(ErrorReporter ReportError) { this->ReportError = std::move(ReportError); @@ -1176,19 +1313,18 @@ public: return *this; } - void legacyFailQuery(AsynchronousSymbolQuery &Q, Error Err); + /// Search the given JITDylibs to find the flags associated with each of the + /// given symbols. + void lookupFlags(LookupKind K, JITDylibSearchOrder SearchOrder, + SymbolLookupSet Symbols, + unique_function)> OnComplete); - using LegacyAsyncLookupFunction = std::function Q, SymbolNameSet Names)>; - - /// A legacy lookup function for JITSymbolResolverAdapter. - /// Do not use -- this will be removed soon. - Expected - legacyLookup(LegacyAsyncLookupFunction AsyncLookup, SymbolNameSet Names, - SymbolState RequiredState, - RegisterDependenciesFunction RegisterDependencies); + /// Blocking version of lookupFlags. 
+ Expected lookupFlags(LookupKind K, + JITDylibSearchOrder SearchOrder, + SymbolLookupSet Symbols); - /// Search the given JITDylib list for the given symbols. + /// Search the given JITDylibs for the given symbols. /// /// SearchOrder lists the JITDylibs to search. For each dylib, the associated /// boolean indicates whether the search should match against non-exported @@ -1248,10 +1384,11 @@ public: SymbolState RequiredState = SymbolState::Ready); /// Materialize the given unit. - void dispatchMaterialization(std::unique_ptr MU, - MaterializationResponsibility MR) { + void + dispatchMaterialization(std::unique_ptr MU, + std::unique_ptr MR) { assert(MU && "MU must be non-null"); - DEBUG_WITH_TYPE("orc", dumpDispatchInfo(MR.getTargetJITDylib(), *MU)); + DEBUG_WITH_TYPE("orc", dumpDispatchInfo(MR->getTargetJITDylib(), *MU)); DispatchMaterialization(std::move(MU), std::move(MR)); } @@ -1263,41 +1400,124 @@ private: logAllUnhandledErrors(std::move(Err), errs(), "JIT session error: "); } - static void - materializeOnCurrentThread(std::unique_ptr MU, - MaterializationResponsibility MR) { + static void materializeOnCurrentThread( + std::unique_ptr MU, + std::unique_ptr MR) { MU->materialize(std::move(MR)); } - void runOutstandingMUs(); + void dispatchOutstandingMUs(); + + static std::unique_ptr + createMaterializationResponsibility(ResourceTracker &RT, + SymbolFlagsMap Symbols, + SymbolStringPtr InitSymbol) { + auto &JD = RT.getJITDylib(); + std::unique_ptr MR( + new MaterializationResponsibility(&JD, std::move(Symbols), + std::move(InitSymbol))); + JD.MRTrackers[MR.get()] = &RT; + return MR; + } + + Error removeResourceTracker(ResourceTracker &RT); + void transferResourceTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT); + void destroyResourceTracker(ResourceTracker &RT); + + // State machine functions for query application.. 
+ + /// IL_updateCandidatesFor is called to remove already-defined symbols that + /// match a given query from the set of candidate symbols to generate + /// definitions for (no need to generate a definition if one already exists). + Error IL_updateCandidatesFor(JITDylib &JD, JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Candidates, + SymbolLookupSet *NonCandidates); + + /// OL_applyQueryPhase1 is an optionally re-startable loop for triggering + /// definition generation. It is called when a lookup is performed, and again + /// each time that LookupState::continueLookup is called. + void OL_applyQueryPhase1(std::unique_ptr IPLS, + Error Err); + + /// OL_completeLookup is run once phase 1 successfully completes for a lookup + /// call. It attempts to attach the symbol to all symbol table entries and + /// collect all MaterializationUnits to dispatch. If this method fails then + /// all MaterializationUnits will be left un-materialized. + void OL_completeLookup(std::unique_ptr IPLS, + std::shared_ptr Q, + RegisterDependenciesFunction RegisterDependencies); + + /// OL_completeLookupFlags is run once phase 1 successfully completes for a + /// lookupFlags call. + void OL_completeLookupFlags( + std::unique_ptr IPLS, + unique_function)> OnComplete); + + // State machine functions for MaterializationResponsibility. 
+ void OL_destroyMaterializationResponsibility( + MaterializationResponsibility &MR); + SymbolNameSet OL_getRequestedSymbols(const MaterializationResponsibility &MR); + Error OL_notifyResolved(MaterializationResponsibility &MR, + const SymbolMap &Symbols); + Error OL_notifyEmitted(MaterializationResponsibility &MR); + Error OL_defineMaterializing(MaterializationResponsibility &MR, + SymbolFlagsMap SymbolFlags); + void OL_notifyFailed(MaterializationResponsibility &MR); + Error OL_replace(MaterializationResponsibility &MR, + std::unique_ptr MU); + Expected> + OL_delegate(MaterializationResponsibility &MR, const SymbolNameSet &Symbols); + void OL_addDependencies(MaterializationResponsibility &MR, + const SymbolStringPtr &Name, + const SymbolDependenceMap &Dependencies); + void OL_addDependenciesForAll(MaterializationResponsibility &MR, + const SymbolDependenceMap &Dependencies); #ifndef NDEBUG void dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU); #endif // NDEBUG mutable std::recursive_mutex SessionMutex; + bool SessionOpen = true; std::shared_ptr SSP; std::unique_ptr P; - VModuleKey LastKey = 0; ErrorReporter ReportError = logErrorsToStdErr; DispatchMaterializationFunction DispatchMaterialization = materializeOnCurrentThread; - std::vector> JDs; + std::vector ResourceManagers; + + std::vector JDs; // FIXME: Remove this (and runOutstandingMUs) once the linking layer works // with callbacks from asynchronous queries. 
mutable std::recursive_mutex OutstandingMUsMutex; std::vector, - MaterializationResponsibility>> + std::unique_ptr>> OutstandingMUs; }; +inline ExecutionSession &MaterializationResponsibility::getExecutionSession() { + return JD->getExecutionSession(); +} + +template +Error MaterializationResponsibility::withResourceKeyDo(Func &&F) const { + return JD->getExecutionSession().runSessionLocked([&]() -> Error { + auto I = JD->MRTrackers.find(this); + assert(I != JD->MRTrackers.end() && "No tracker for this MR"); + if (I->second->isDefunct()) + return make_error(I->second); + F(I->second->getKeyUnsafe()); + return Error::success(); + }); +} + template GeneratorT &JITDylib::addGenerator(std::unique_ptr DefGenerator) { auto &G = *DefGenerator; - ES.runSessionLocked( - [&]() { DefGenerators.push_back(std::move(DefGenerator)); }); + std::lock_guard Lock(GeneratorsMutex); + DefGenerators.push_back(std::move(DefGenerator)); return G; } @@ -1308,7 +1528,8 @@ auto JITDylib::withLinkOrderDo(Func &&F) } template -Error JITDylib::define(std::unique_ptr &&MU) { +Error JITDylib::define(std::unique_ptr &&MU, + ResourceTrackerSP RT) { assert(MU && "Can not define with a null MU"); if (MU->getSymbols().empty()) { @@ -1320,29 +1541,36 @@ Error JITDylib::define(std::unique_ptr &&MU) { return Error::success(); } else DEBUG_WITH_TYPE("orc", { - dbgs() << "Defining MU " << MU->getName() << " for " << getName() << "\n"; + dbgs() << "Defining MU " << MU->getName() << " for " << getName() + << " (tracker: "; + if (RT == getDefaultResourceTracker()) + dbgs() << "default)"; + else if (RT) + dbgs() << RT.get() << ")\n"; + else + dbgs() << "0x0, default will be used)\n"; }); return ES.runSessionLocked([&, this]() -> Error { if (auto Err = defineImpl(*MU)) return Err; + if (!RT) + RT = getDefaultResourceTracker(); + if (auto *P = ES.getPlatform()) { - if (auto Err = P->notifyAdding(*this, *MU)) + if (auto Err = P->notifyAdding(*RT, *MU)) return Err; } - /// defineImpl succeeded. 
- auto UMI = std::make_shared(std::move(MU)); - for (auto &KV : UMI->MU->getSymbols()) - UnmaterializedInfos[KV.first] = UMI; - + installMaterializationUnit(std::move(MU), *RT); return Error::success(); }); } template -Error JITDylib::define(std::unique_ptr &MU) { +Error JITDylib::define(std::unique_ptr &MU, + ResourceTrackerSP RT) { assert(MU && "Can not define with a null MU"); if (MU->getSymbols().empty()) { @@ -1354,30 +1582,36 @@ Error JITDylib::define(std::unique_ptr &MU) { return Error::success(); } else DEBUG_WITH_TYPE("orc", { - dbgs() << "Defining MU " << MU->getName() << " for " << getName() << "\n"; + dbgs() << "Defining MU " << MU->getName() << " for " << getName() + << " (tracker: "; + if (RT == getDefaultResourceTracker()) + dbgs() << "default)"; + else if (RT) + dbgs() << RT.get() << ")\n"; + else + dbgs() << "0x0, default will be used)\n"; }); return ES.runSessionLocked([&, this]() -> Error { if (auto Err = defineImpl(*MU)) return Err; + if (!RT) + RT = getDefaultResourceTracker(); + if (auto *P = ES.getPlatform()) { - if (auto Err = P->notifyAdding(*this, *MU)) + if (auto Err = P->notifyAdding(*RT, *MU)) return Err; } - /// defineImpl succeeded. - auto UMI = std::make_shared(std::move(MU)); - for (auto &KV : UMI->MU->getSymbols()) - UnmaterializedInfos[KV.first] = UMI; - + installMaterializationUnit(std::move(MU), *RT); return Error::success(); }); } /// ReexportsGenerator can be used with JITDylib::addGenerator to automatically /// re-export a subset of the source JITDylib's symbols in the target. 
-class ReexportsGenerator : public JITDylib::DefinitionGenerator { +class ReexportsGenerator : public DefinitionGenerator { public: using SymbolPredicate = std::function; @@ -1388,7 +1622,7 @@ public: JITDylibLookupFlags SourceJDLookupFlags, SymbolPredicate Allow = SymbolPredicate()); - Error tryToGenerate(LookupKind K, JITDylib &JD, + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &LookupSet) override; @@ -1398,6 +1632,57 @@ private: SymbolPredicate Allow; }; +// --------------- IMPLEMENTATION -------------- +// Implementations for inline functions/methods. +// --------------------------------------------- + +inline MaterializationResponsibility::~MaterializationResponsibility() { + JD->getExecutionSession().OL_destroyMaterializationResponsibility(*this); +} + +inline SymbolNameSet MaterializationResponsibility::getRequestedSymbols() const { + return JD->getExecutionSession().OL_getRequestedSymbols(*this); +} + +inline Error MaterializationResponsibility::notifyResolved( + const SymbolMap &Symbols) { + return JD->getExecutionSession().OL_notifyResolved(*this, Symbols); +} + +inline Error MaterializationResponsibility::notifyEmitted() { + return JD->getExecutionSession().OL_notifyEmitted(*this); +} + +inline Error MaterializationResponsibility::defineMaterializing( + SymbolFlagsMap SymbolFlags) { + return JD->getExecutionSession().OL_defineMaterializing( + *this, std::move(SymbolFlags)); +} + +inline void MaterializationResponsibility::failMaterialization() { + JD->getExecutionSession().OL_notifyFailed(*this); +} + +inline Error MaterializationResponsibility::replace( + std::unique_ptr MU) { + return JD->getExecutionSession().OL_replace(*this, std::move(MU)); +} + +inline Expected> +MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) { + return JD->getExecutionSession().OL_delegate(*this, Symbols); +} + +inline void MaterializationResponsibility::addDependencies( + const 
SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies) { + JD->getExecutionSession().OL_addDependencies(*this, Name, Dependencies); +} + +inline void MaterializationResponsibility::addDependenciesForAll( + const SymbolDependenceMap &Dependencies) { + JD->getExecutionSession().OL_addDependenciesForAll(*this, Dependencies); +} + } // End namespace orc } // End namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index 3b824b83b052..fdddc9694d0b 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -18,7 +18,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/Mangling.h" -#include "llvm/ExecutionEngine/Orc/OrcError.h" +#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/Object/Archive.h" #include "llvm/Support/DynamicLibrary.h" @@ -41,17 +41,6 @@ namespace orc { class ObjectLayer; -/// Run a main function, returning the result. -/// -/// If the optional ProgramName argument is given then it will be inserted -/// before the strings in Args as the first argument to the called function. -/// -/// It is legal to have an empty argument list and no program name, however -/// many main functions will expect a name argument at least, and will fail -/// if none is provided. -int runAsMain(int (*Main)(int, char *[]), ArrayRef Args, - Optional ProgramName = None); - /// This iterator provides a convenient way to iterate over the elements /// of an llvm.global_ctors/llvm.global_dtors instance. 
/// @@ -152,56 +141,6 @@ inline iterator_range getStaticInitGVs(Module &M) { return make_range(StaticInitGVIterator(M), StaticInitGVIterator()); } -/// Convenience class for recording constructor/destructor names for -/// later execution. -template -class LegacyCtorDtorRunner { -public: - /// Construct a CtorDtorRunner for the given range using the given - /// name mangling function. - LLVM_ATTRIBUTE_DEPRECATED( - LegacyCtorDtorRunner(std::vector CtorDtorNames, - VModuleKey K), - "ORCv1 utilities (utilities with the 'Legacy' prefix) are deprecated. " - "Please use the ORCv2 CtorDtorRunner utility instead"); - - LegacyCtorDtorRunner(ORCv1DeprecationAcknowledgement, - std::vector CtorDtorNames, VModuleKey K) - : CtorDtorNames(std::move(CtorDtorNames)), K(K) {} - - /// Run the recorded constructors/destructors through the given JIT - /// layer. - Error runViaLayer(JITLayerT &JITLayer) const { - using CtorDtorTy = void (*)(); - - for (const auto &CtorDtorName : CtorDtorNames) { - if (auto CtorDtorSym = JITLayer.findSymbolIn(K, CtorDtorName, false)) { - if (auto AddrOrErr = CtorDtorSym.getAddress()) { - CtorDtorTy CtorDtor = - reinterpret_cast(static_cast(*AddrOrErr)); - CtorDtor(); - } else - return AddrOrErr.takeError(); - } else { - if (auto Err = CtorDtorSym.takeError()) - return Err; - else - return make_error(CtorDtorName); - } - } - return Error::success(); - } - -private: - std::vector CtorDtorNames; - orc::VModuleKey K; -}; - -template -LegacyCtorDtorRunner::LegacyCtorDtorRunner( - std::vector CtorDtorNames, VModuleKey K) - : CtorDtorNames(std::move(CtorDtorNames)), K(K) {} - class CtorDtorRunner { public: CtorDtorRunner(JITDylib &JD) : JD(JD) {} @@ -250,45 +189,6 @@ protected: void *DSOHandle); }; -class LegacyLocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase { -public: - /// Create a runtime-overrides class. 
- template - LLVM_ATTRIBUTE_DEPRECATED( - LegacyLocalCXXRuntimeOverrides(const MangleFtorT &Mangle), - "ORCv1 utilities (utilities with the 'Legacy' prefix) are deprecated. " - "Please use the ORCv2 LocalCXXRuntimeOverrides utility instead"); - - template - LegacyLocalCXXRuntimeOverrides(ORCv1DeprecationAcknowledgement, - const MangleFtorT &Mangle) { - addOverride(Mangle("__dso_handle"), toTargetAddress(&DSOHandleOverride)); - addOverride(Mangle("__cxa_atexit"), toTargetAddress(&CXAAtExitOverride)); - } - - /// Search overrided symbols. - JITEvaluatedSymbol searchOverrides(const std::string &Name) { - auto I = CXXRuntimeOverrides.find(Name); - if (I != CXXRuntimeOverrides.end()) - return JITEvaluatedSymbol(I->second, JITSymbolFlags::Exported); - return nullptr; - } - -private: - void addOverride(const std::string &Name, JITTargetAddress Addr) { - CXXRuntimeOverrides.insert(std::make_pair(Name, Addr)); - } - - StringMap CXXRuntimeOverrides; -}; - -template -LegacyLocalCXXRuntimeOverrides::LegacyLocalCXXRuntimeOverrides( - const MangleFtorT &Mangle) { - addOverride(Mangle("__dso_handle"), toTargetAddress(&DSOHandleOverride)); - addOverride(Mangle("__cxa_atexit"), toTargetAddress(&CXAAtExitOverride)); -} - class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase { public: Error enable(JITDylib &JD, MangleAndInterner &Mangler); @@ -315,7 +215,7 @@ private: /// If an instance of this class is attached to a JITDylib as a fallback /// definition generator, then any symbol found in the given DynamicLibrary that /// passes the 'Allow' predicate will be added to the JITDylib. 
-class DynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { +class DynamicLibrarySearchGenerator : public DefinitionGenerator { public: using SymbolPredicate = std::function; @@ -343,7 +243,7 @@ public: return Load(nullptr, GlobalPrefix, std::move(Allow)); } - Error tryToGenerate(LookupKind K, JITDylib &JD, + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) override; @@ -358,7 +258,7 @@ private: /// If an instance of this class is attached to a JITDylib as a fallback /// definition generator, then any symbol found in the archive will result in /// the containing object being added to the JITDylib. -class StaticLibraryDefinitionGenerator : public JITDylib::DefinitionGenerator { +class StaticLibraryDefinitionGenerator : public DefinitionGenerator { public: /// Try to create a StaticLibraryDefinitionGenerator from the given path. /// @@ -381,7 +281,7 @@ public: static Expected> Create(ObjectLayer &L, std::unique_ptr ArchiveBuffer); - Error tryToGenerate(LookupKind K, JITDylib &JD, + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) override; diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h deleted file mode 100644 index a4e43d4e1c9c..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/GlobalMappingLayer.h +++ /dev/null @@ -1,111 +0,0 @@ -//===- GlobalMappingLayer.h - Run all IR through a functor ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Convenience layer for injecting symbols that will appear in calls to -// findSymbol. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_GLOBALMAPPINGLAYER_H -#define LLVM_EXECUTIONENGINE_ORC_GLOBALMAPPINGLAYER_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include -#include -#include - -namespace llvm { - -class Module; -class JITSymbolResolver; - -namespace orc { - -/// Global mapping layer. -/// -/// This layer overrides the findSymbol method to first search a local symbol -/// table that the client can define. It can be used to inject new symbol -/// mappings into the JIT. Beware, however: symbols within a single IR module or -/// object file will still resolve locally (via RuntimeDyld's symbol table) - -/// such internal references cannot be overriden via this layer. -template -class GlobalMappingLayer { -public: - - /// Handle to an added module. - using ModuleHandleT = typename BaseLayerT::ModuleHandleT; - - /// Construct an GlobalMappingLayer with the given BaseLayer - GlobalMappingLayer(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {} - - /// Add the given module to the JIT. - /// @return A handle for the added modules. - Expected - addModule(std::shared_ptr M, - std::shared_ptr Resolver) { - return BaseLayer.addModule(std::move(M), std::move(Resolver)); - } - - /// Remove the module set associated with the handle H. - Error removeModule(ModuleHandleT H) { return BaseLayer.removeModule(H); } - - /// Manually set the address to return for the given symbol. - void setGlobalMapping(const std::string &Name, JITTargetAddress Addr) { - SymbolTable[Name] = Addr; - } - - /// Remove the given symbol from the global mapping. 
- void eraseGlobalMapping(const std::string &Name) { - SymbolTable.erase(Name); - } - - /// Search for the given named symbol. - /// - /// This method will first search the local symbol table, returning - /// any symbol found there. If the symbol is not found in the local - /// table then this call will be passed through to the base layer. - /// - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - auto I = SymbolTable.find(Name); - if (I != SymbolTable.end()) - return JITSymbol(I->second, JITSymbolFlags::Exported); - return BaseLayer.findSymbol(Name, ExportedSymbolsOnly); - } - - /// Get the address of the given symbol in the context of the of the - /// module represented by the handle H. This call is forwarded to the - /// base layer's implementation. - /// @param H The handle for the module to search in. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it is found in the - /// given module. - JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name, - bool ExportedSymbolsOnly) { - return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly); - } - - /// Immediately emit and finalize the module set represented by the - /// given handle. - /// @param H Handle for module set to emit/finalize. 
- Error emitAndFinalize(ModuleHandleT H) { - return BaseLayer.emitAndFinalize(H); - } - -private: - BaseLayerT &BaseLayer; - std::map SymbolTable; -}; - -} // end namespace orc -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_GLOBALMAPPINGLAYER_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h index eb74d283f043..f8fdb171bbf9 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h @@ -45,8 +45,8 @@ public: IRSymbolMapper::ManglingOptions MO; }; - using NotifyCompiledFunction = - std::function; + using NotifyCompiledFunction = std::function; IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, std::unique_ptr Compile); @@ -55,7 +55,8 @@ public: void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled); - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; private: mutable std::mutex IRLayerMutex; @@ -65,99 +66,6 @@ private: NotifyCompiledFunction NotifyCompiled = NotifyCompiledFunction(); }; -/// Eager IR compiling layer. -/// -/// This layer immediately compiles each IR module added via addModule to an -/// object file and adds this module file to the layer below, which must -/// implement the object layer concept. -template -class LegacyIRCompileLayer { -public: - /// Callback type for notifications when modules are compiled. - using NotifyCompiledCallback = - std::function)>; - - /// Construct an LegacyIRCompileLayer with the given BaseLayer, which must - /// implement the ObjectLayer concept. 
- LLVM_ATTRIBUTE_DEPRECATED( - LegacyIRCompileLayer( - BaseLayerT &BaseLayer, CompileFtor Compile, - NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback()), - "ORCv1 layers (layers with the 'Legacy' prefix) are deprecated. Please " - "use " - "the ORCv2 IRCompileLayer instead"); - - /// Legacy layer constructor with deprecation acknowledgement. - LegacyIRCompileLayer( - ORCv1DeprecationAcknowledgement, BaseLayerT &BaseLayer, - CompileFtor Compile, - NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback()) - : BaseLayer(BaseLayer), Compile(std::move(Compile)), - NotifyCompiled(std::move(NotifyCompiled)) {} - - /// Get a reference to the compiler functor. - CompileFtor& getCompiler() { return Compile; } - - /// (Re)set the NotifyCompiled callback. - void setNotifyCompiled(NotifyCompiledCallback NotifyCompiled) { - this->NotifyCompiled = std::move(NotifyCompiled); - } - - /// Compile the module, and add the resulting object to the base layer - /// along with the given memory manager and symbol resolver. - Error addModule(VModuleKey K, std::unique_ptr M) { - auto Obj = Compile(*M); - if (!Obj) - return Obj.takeError(); - if (auto Err = BaseLayer.addObject(std::move(K), std::move(*Obj))) - return Err; - if (NotifyCompiled) - NotifyCompiled(std::move(K), std::move(M)); - return Error::success(); - } - - /// Remove the module associated with the VModuleKey K. - Error removeModule(VModuleKey K) { return BaseLayer.removeObject(K); } - - /// Search for the given named symbol. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - return BaseLayer.findSymbol(Name, ExportedSymbolsOnly); - } - - /// Get the address of the given symbol in compiled module represented - /// by the handle H. 
This call is forwarded to the base layer's - /// implementation. - /// @param K The VModuleKey for the module to search in. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it is found in the - /// given module. - JITSymbol findSymbolIn(VModuleKey K, const std::string &Name, - bool ExportedSymbolsOnly) { - return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly); - } - - /// Immediately emit and finalize the module represented by the given - /// handle. - /// @param K The VModuleKey for the module to emit/finalize. - Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); } - -private: - BaseLayerT &BaseLayer; - CompileFtor Compile; - NotifyCompiledCallback NotifyCompiled; -}; - -template -LegacyIRCompileLayer::LegacyIRCompileLayer( - BaseLayerT &BaseLayer, CompileFtor Compile, - NotifyCompiledCallback NotifyCompiled) - : BaseLayer(BaseLayer), Compile(std::move(Compile)), - NotifyCompiled(std::move(NotifyCompiled)) {} - } // end namespace orc } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h index 296d74ae6b86..66966a0f8762 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_IRTRANSFORMLAYER_H #define LLVM_EXECUTIONENGINE_ORC_IRTRANSFORMLAYER_H +#include "llvm/ADT/FunctionExtras.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include @@ -27,7 +28,7 @@ namespace orc { /// before operating on the module. 
class IRTransformLayer : public IRLayer { public: - using TransformFunction = std::function( + using TransformFunction = unique_function( ThreadSafeModule, MaterializationResponsibility &R)>; IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer, @@ -37,7 +38,8 @@ public: this->Transform = std::move(Transform); } - void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + void emit(std::unique_ptr R, + ThreadSafeModule TSM) override; static ThreadSafeModule identityTransform(ThreadSafeModule TSM, MaterializationResponsibility &R) { @@ -49,80 +51,6 @@ private: TransformFunction Transform; }; -/// IR mutating layer. -/// -/// This layer applies a user supplied transform to each module that is added, -/// then adds the transformed module to the layer below. -template -class LegacyIRTransformLayer { -public: - - /// Construct an LegacyIRTransformLayer with the given BaseLayer - LLVM_ATTRIBUTE_DEPRECATED( - LegacyIRTransformLayer(BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()), - "ORCv1 layers (layers with the 'Legacy' prefix) are deprecated. Please " - "use " - "the ORCv2 IRTransformLayer instead"); - - /// Legacy layer constructor with deprecation acknowledgement. - LegacyIRTransformLayer(ORCv1DeprecationAcknowledgement, BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()) - : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} - - /// Apply the transform functor to the module, then add the module to - /// the layer below, along with the memory manager and symbol resolver. - /// - /// @return A handle for the added modules. - Error addModule(VModuleKey K, std::unique_ptr M) { - return BaseLayer.addModule(std::move(K), Transform(std::move(M))); - } - - /// Remove the module associated with the VModuleKey K. - Error removeModule(VModuleKey K) { return BaseLayer.removeModule(K); } - - /// Search for the given named symbol. - /// @param Name The name of the symbol to search for. 
- /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - return BaseLayer.findSymbol(Name, ExportedSymbolsOnly); - } - - /// Get the address of the given symbol in the context of the module - /// represented by the VModuleKey K. This call is forwarded to the base - /// layer's implementation. - /// @param K The VModuleKey for the module to search in. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it is found in the - /// given module. - JITSymbol findSymbolIn(VModuleKey K, const std::string &Name, - bool ExportedSymbolsOnly) { - return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly); - } - - /// Immediately emit and finalize the module represented by the given - /// VModuleKey. - /// @param K The VModuleKey for the module to emit/finalize. - Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); } - - /// Access the transform functor directly. - TransformFtor& getTransform() { return Transform; } - - /// Access the mumate functor directly. 
- const TransformFtor& getTransform() const { return Transform; } - -private: - BaseLayerT &BaseLayer; - TransformFtor Transform; -}; - -template -LegacyIRTransformLayer::LegacyIRTransformLayer( - BaseLayerT &BaseLayer, TransformFtor Transform) - : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} - } // end namespace orc } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index e0cfd8bf2409..78e3ceef50e2 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -62,14 +62,33 @@ public: JITTargetAddress TrampolineAddr, NotifyLandingResolvedFunction OnLandingResolved) const>; - virtual ~TrampolinePool() {} + virtual ~TrampolinePool(); /// Get an available trampoline address. /// Returns an error if no trampoline can be created. - virtual Expected getTrampoline() = 0; + Expected getTrampoline() { + std::lock_guard Lock(TPMutex); + if (AvailableTrampolines.empty()) { + if (auto Err = grow()) + return std::move(Err); + } + assert(!AvailableTrampolines.empty() && "Failed to grow trampoline pool"); + auto TrampolineAddr = AvailableTrampolines.back(); + AvailableTrampolines.pop_back(); + return TrampolineAddr; + } -private: - virtual void anchor(); + /// Returns the given trampoline to the pool for re-use. + void releaseTrampoline(JITTargetAddress TrampolineAddr) { + std::lock_guard Lock(TPMutex); + AvailableTrampolines.push_back(TrampolineAddr); + } + +protected: + virtual Error grow() = 0; + + std::mutex TPMutex; + std::vector AvailableTrampolines; }; /// A trampoline pool for trampolines within the current process. @@ -90,26 +109,6 @@ public: return std::move(LTP); } - /// Get a free trampoline. Returns an error if one can not be provided (e.g. - /// because the pool is empty and can not be grown). 
- Expected getTrampoline() override { - std::lock_guard Lock(LTPMutex); - if (AvailableTrampolines.empty()) { - if (auto Err = grow()) - return std::move(Err); - } - assert(!AvailableTrampolines.empty() && "Failed to grow trampoline pool"); - auto TrampolineAddr = AvailableTrampolines.back(); - AvailableTrampolines.pop_back(); - return TrampolineAddr; - } - - /// Returns the given trampoline to the pool for re-use. - void releaseTrampoline(JITTargetAddress TrampolineAddr) { - std::lock_guard Lock(LTPMutex); - AvailableTrampolines.push_back(TrampolineAddr); - } - private: static JITTargetAddress reenter(void *TrampolinePoolPtr, void *TrampolineId) { LocalTrampolinePool *TrampolinePool = @@ -154,8 +153,8 @@ private: } } - Error grow() { - assert(this->AvailableTrampolines.empty() && "Growing prematurely?"); + Error grow() override { + assert(AvailableTrampolines.empty() && "Growing prematurely?"); std::error_code EC; auto TrampolineBlock = @@ -175,7 +174,7 @@ private: pointerToJITTargetAddress(ResolverBlock.base()), NumTrampolines); for (unsigned I = 0; I < NumTrampolines; ++I) - this->AvailableTrampolines.push_back(pointerToJITTargetAddress( + AvailableTrampolines.push_back(pointerToJITTargetAddress( TrampolineMem + (I * ORCABI::TrampolineSize))); if (auto EC = sys::Memory::protectMappedMemory( @@ -189,10 +188,8 @@ private: ResolveLandingFunction ResolveLanding; - std::mutex LTPMutex; sys::OwningMemoryBlock ResolverBlock; std::vector TrampolineBlocks; - std::vector AvailableTrampolines; }; /// Target-independent base class for compile callback management. 
diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index 96f8e169e7dc..ff0aa0238523 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -19,7 +19,6 @@ #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" -#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ThreadPool.h" @@ -29,6 +28,8 @@ namespace orc { class LLJITBuilderState; class LLLazyJITBuilderState; +class ObjectTransformLayer; +class TargetProcessControl; /// A pre-fabricated ORC JIT stack that can serve as an alternative to MCJIT. /// @@ -85,21 +86,8 @@ public: return ES->createJITDylib(std::move(Name)); } - /// A convenience method for defining MUs in LLJIT's Main JITDylib. This can - /// be useful for succinctly defining absolute symbols, aliases and - /// re-exports. - template - Error define(std::unique_ptr &&MU) { - return Main->define(std::move(MU)); - } - - /// A convenience method for defining MUs in LLJIT's Main JITDylib. This can - /// be usedful for succinctly defining absolute symbols, aliases and - /// re-exports. - template - Error define(std::unique_ptr &MU) { - return Main->define(MU); - } + /// Adds an IR module with the given ResourceTracker. + Error addIRModule(ResourceTrackerSP RT, ThreadSafeModule TSM); /// Adds an IR module to the given JITDylib. Error addIRModule(JITDylib &JD, ThreadSafeModule TSM); @@ -109,6 +97,9 @@ public: return addIRModule(*Main, std::move(TSM)); } + /// Adds an object file to the given JITDylib. + Error addObjectFile(ResourceTrackerSP RT, std::unique_ptr Obj); + /// Adds an object file to the given JITDylib. 
Error addObjectFile(JITDylib &JD, std::unique_ptr Obj); @@ -178,7 +169,7 @@ public: ObjectLayer &getObjLinkingLayer() { return *ObjLinkingLayer; } /// Returns a reference to the object transform layer. - ObjectTransformLayer &getObjTransformLayer() { return ObjTransformLayer; } + ObjectTransformLayer &getObjTransformLayer() { return *ObjTransformLayer; } /// Returns a reference to the IR transform layer. IRTransformLayer &getIRTransformLayer() { return *TransformLayer; } @@ -195,7 +186,7 @@ public: } protected: - static std::unique_ptr + static Expected> createObjectLinkingLayer(LLJITBuilderState &S, ExecutionSession &ES); static Expected> @@ -218,7 +209,7 @@ protected: std::unique_ptr CompileThreads; std::unique_ptr ObjLinkingLayer; - ObjectTransformLayer ObjTransformLayer; + std::unique_ptr ObjTransformLayer; std::unique_ptr CompileLayer; std::unique_ptr TransformLayer; std::unique_ptr InitHelperTransformLayer; @@ -237,6 +228,9 @@ public: CODLayer->setPartitionFunction(std::move(Partition)); } + /// Returns a reference to the on-demand layer. + CompileOnDemandLayer &getCompileOnDemandLayer() { return *CODLayer; } + /// Add a module to be lazily compiled to JITDylib JD. Error addLazyIRModule(JITDylib &JD, ThreadSafeModule M); @@ -256,8 +250,9 @@ private: class LLJITBuilderState { public: - using ObjectLinkingLayerCreator = std::function( - ExecutionSession &, const Triple &TT)>; + using ObjectLinkingLayerCreator = + std::function>(ExecutionSession &, + const Triple &)>; using CompileFunctionCreator = std::function>( @@ -272,6 +267,7 @@ public: CompileFunctionCreator CreateCompileFunction; PlatformSetupFunction SetUpPlatform; unsigned NumCompileThreads = 0; + TargetProcessControl *TPC = nullptr; /// Called prior to JIT class construcion to fix up defaults. Error prepareForConstruction(); @@ -354,6 +350,17 @@ public: return impl(); } + /// Set a TargetProcessControl object. 
+ /// + /// If the platform uses ObjectLinkingLayer by default and no + /// ObjectLinkingLayerCreator has been set then the TargetProcessControl + /// object will be used to supply the memory manager for the + /// ObjectLinkingLayer. + SetterImpl &setTargetProcessControl(TargetProcessControl &TPC) { + impl().TPC = &TPC; + return impl(); + } + /// Create an instance of the JIT. Expected> create() { if (auto Err = impl().prepareForConstruction()) diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h deleted file mode 100644 index b31914f12a0d..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LambdaResolver.h +++ /dev/null @@ -1,84 +0,0 @@ -//===- LambdaResolverMM - Redirect symbol lookup via a functor --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Defines a RuntimeDyld::SymbolResolver subclass that uses a user-supplied -// functor for symbol resolution. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H -#define LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/OrcV1Deprecation.h" -#include - -namespace llvm { -namespace orc { - -template -class LambdaResolver : public LegacyJITSymbolResolver { -public: - LLVM_ATTRIBUTE_DEPRECATED( - LambdaResolver(DylibLookupFtorT DylibLookupFtor, - ExternalLookupFtorT ExternalLookupFtor), - "ORCv1 utilities (including resolvers) are deprecated and will be " - "removed " - "in the next release. 
Please use ORCv2 (see docs/ORCv2.rst)"); - - LambdaResolver(ORCv1DeprecationAcknowledgement, - DylibLookupFtorT DylibLookupFtor, - ExternalLookupFtorT ExternalLookupFtor) - : DylibLookupFtor(DylibLookupFtor), - ExternalLookupFtor(ExternalLookupFtor) {} - - JITSymbol findSymbolInLogicalDylib(const std::string &Name) final { - return DylibLookupFtor(Name); - } - - JITSymbol findSymbol(const std::string &Name) final { - return ExternalLookupFtor(Name); - } - -private: - DylibLookupFtorT DylibLookupFtor; - ExternalLookupFtorT ExternalLookupFtor; -}; - -template -LambdaResolver::LambdaResolver( - DylibLookupFtorT DylibLookupFtor, ExternalLookupFtorT ExternalLookupFtor) - : DylibLookupFtor(DylibLookupFtor), ExternalLookupFtor(ExternalLookupFtor) { -} - -template -std::shared_ptr> -createLambdaResolver(DylibLookupFtorT DylibLookupFtor, - ExternalLookupFtorT ExternalLookupFtor) { - using LR = LambdaResolver; - return std::make_unique(std::move(DylibLookupFtor), - std::move(ExternalLookupFtor)); -} - -template -std::shared_ptr> -createLambdaResolver(ORCv1DeprecationAcknowledgement, - DylibLookupFtorT DylibLookupFtor, - ExternalLookupFtorT ExternalLookupFtor) { - using LR = LambdaResolver; - return std::make_unique(AcknowledgeORCv1Deprecation, - std::move(DylibLookupFtor), - std::move(ExternalLookupFtor)); -} - -} // end namespace orc -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Layer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Layer.h index e843d0f56245..f9cc15583b42 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Layer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Layer.h @@ -34,15 +34,15 @@ public: /// SymbolFlags and SymbolToDefinition maps. 
IRMaterializationUnit(ExecutionSession &ES, const IRSymbolMapper::ManglingOptions &MO, - ThreadSafeModule TSM, VModuleKey K); + ThreadSafeModule TSM); /// Create an IRMaterializationLayer from a module, and pre-existing /// SymbolFlags and SymbolToDefinition maps. The maps must provide /// entries for each definition in M. /// This constructor is useful for delegating work from one /// IRMaterializationUnit to another. - IRMaterializationUnit(ThreadSafeModule TSM, VModuleKey K, - SymbolFlagsMap SymbolFlags, SymbolStringPtr InitSymbol, + IRMaterializationUnit(ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags, + SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition); /// Return the ModuleIdentifier as the name for this MaterializationUnit. @@ -94,13 +94,19 @@ public: /// Returns the current value of the CloneToNewContextOnEmit flag. bool getCloneToNewContextOnEmit() const { return CloneToNewContextOnEmit; } + /// Add a MaterializatinoUnit representing the given IR to the JITDylib + /// targeted by the given tracker. + virtual Error add(ResourceTrackerSP RT, ThreadSafeModule TSM); + /// Adds a MaterializationUnit representing the given IR to the given - /// JITDylib. - virtual Error add(JITDylib &JD, ThreadSafeModule TSM, - VModuleKey K = VModuleKey()); + /// JITDylib. If RT is not specif + Error add(JITDylib &JD, ThreadSafeModule TSM) { + return add(JD.getDefaultResourceTracker(), std::move(TSM)); + } /// Emit should materialize the given IR. 
- virtual void emit(MaterializationResponsibility R, ThreadSafeModule TSM) = 0; + virtual void emit(std::unique_ptr R, + ThreadSafeModule TSM) = 0; private: bool CloneToNewContextOnEmit = false; @@ -114,14 +120,12 @@ class BasicIRLayerMaterializationUnit : public IRMaterializationUnit { public: BasicIRLayerMaterializationUnit(IRLayer &L, const IRSymbolMapper::ManglingOptions &MO, - ThreadSafeModule TSM, VModuleKey K); + ThreadSafeModule TSM); private: - - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; IRLayer &L; - VModuleKey K; }; /// Interface for Layers that accept object files. @@ -135,11 +139,14 @@ public: /// Adds a MaterializationUnit representing the given IR to the given /// JITDylib. - virtual Error add(JITDylib &JD, std::unique_ptr O, - VModuleKey K = VModuleKey()); + virtual Error add(ResourceTrackerSP RT, std::unique_ptr O); + + Error add(JITDylib &JD, std::unique_ptr O) { + return add(JD.getDefaultResourceTracker(), std::move(O)); + } /// Emit should materialize the given IR. 
- virtual void emit(MaterializationResponsibility R, + virtual void emit(std::unique_ptr R, std::unique_ptr O) = 0; private: @@ -151,9 +158,9 @@ private: class BasicObjectLayerMaterializationUnit : public MaterializationUnit { public: static Expected> - Create(ObjectLayer &L, VModuleKey K, std::unique_ptr O); + Create(ObjectLayer &L, std::unique_ptr O); - BasicObjectLayerMaterializationUnit(ObjectLayer &L, VModuleKey K, + BasicObjectLayerMaterializationUnit(ObjectLayer &L, std::unique_ptr O, SymbolFlagsMap SymbolFlags, SymbolStringPtr InitSymbol); @@ -162,8 +169,7 @@ public: StringRef getName() const override; private: - - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; ObjectLayer &L; diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h deleted file mode 100644 index 84f5e0350c2e..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h +++ /dev/null @@ -1,267 +0,0 @@ -//===- LazyEmittingLayer.h - Lazily emit IR to lower JIT layers -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Contains the definition for a lazy-emitting layer for the JIT. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H -#define LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include -#include -#include - -namespace llvm { -namespace orc { - -/// Lazy-emitting IR layer. -/// -/// This layer accepts LLVM IR Modules (via addModule) but does not -/// immediately emit them the layer below. Instead, emission to the base layer -/// is deferred until the first time the client requests the address (via -/// JITSymbol::getAddress) for a symbol contained in this layer. -template class LazyEmittingLayer { -private: - class EmissionDeferredModule { - public: - EmissionDeferredModule(VModuleKey K, std::unique_ptr M) - : K(std::move(K)), M(std::move(M)) {} - - JITSymbol find(StringRef Name, bool ExportedSymbolsOnly, BaseLayerT &B) { - switch (EmitState) { - case NotEmitted: - if (auto GV = searchGVs(Name, ExportedSymbolsOnly)) { - JITSymbolFlags Flags = JITSymbolFlags::fromGlobalValue(*GV); - auto GetAddress = [this, ExportedSymbolsOnly, Name = Name.str(), - &B]() -> Expected { - if (this->EmitState == Emitting) - return 0; - else if (this->EmitState == NotEmitted) { - this->EmitState = Emitting; - if (auto Err = this->emitToBaseLayer(B)) - return std::move(Err); - this->EmitState = Emitted; - } - if (auto Sym = B.findSymbolIn(K, Name, ExportedSymbolsOnly)) - return Sym.getAddress(); - else if (auto Err = Sym.takeError()) - return std::move(Err); - else - llvm_unreachable("Successful symbol lookup should return " - "definition address here"); - }; - return 
JITSymbol(std::move(GetAddress), Flags); - } else - return nullptr; - case Emitting: - // Calling "emit" can trigger a recursive call to 'find' (e.g. to check - // for pre-existing definitions of common-symbol), but any symbol in - // this module would already have been found internally (in the - // RuntimeDyld that did the lookup), so just return a nullptr here. - return nullptr; - case Emitted: - return B.findSymbolIn(K, std::string(Name), ExportedSymbolsOnly); - } - llvm_unreachable("Invalid emit-state."); - } - - Error removeModuleFromBaseLayer(BaseLayerT& BaseLayer) { - return EmitState != NotEmitted ? BaseLayer.removeModule(K) - : Error::success(); - } - - void emitAndFinalize(BaseLayerT &BaseLayer) { - assert(EmitState != Emitting && - "Cannot emitAndFinalize while already emitting"); - if (EmitState == NotEmitted) { - EmitState = Emitting; - emitToBaseLayer(BaseLayer); - EmitState = Emitted; - } - BaseLayer.emitAndFinalize(K); - } - - private: - - const GlobalValue* searchGVs(StringRef Name, - bool ExportedSymbolsOnly) const { - // FIXME: We could clean all this up if we had a way to reliably demangle - // names: We could just demangle name and search, rather than - // mangling everything else. - - // If we have already built the mangled name set then just search it. - if (MangledSymbols) { - auto VI = MangledSymbols->find(Name); - if (VI == MangledSymbols->end()) - return nullptr; - auto GV = VI->second; - if (!ExportedSymbolsOnly || GV->hasDefaultVisibility()) - return GV; - return nullptr; - } - - // If we haven't built the mangled name set yet, try to build it. As an - // optimization this will leave MangledNames set to nullptr if we find - // Name in the process of building the set. - return buildMangledSymbols(Name, ExportedSymbolsOnly); - } - - Error emitToBaseLayer(BaseLayerT &BaseLayer) { - // We don't need the mangled names set any more: Once we've emitted this - // to the base layer we'll just look for symbols there. 
- MangledSymbols.reset(); - return BaseLayer.addModule(std::move(K), std::move(M)); - } - - // If the mangled name of the given GlobalValue matches the given search - // name (and its visibility conforms to the ExportedSymbolsOnly flag) then - // return the symbol. Otherwise, add the mangled name to the Names map and - // return nullptr. - const GlobalValue* addGlobalValue(StringMap &Names, - const GlobalValue &GV, - const Mangler &Mang, StringRef SearchName, - bool ExportedSymbolsOnly) const { - // Modules don't "provide" decls or common symbols. - if (GV.isDeclaration() || GV.hasCommonLinkage()) - return nullptr; - - // Mangle the GV name. - std::string MangledName; - { - raw_string_ostream MangledNameStream(MangledName); - Mang.getNameWithPrefix(MangledNameStream, &GV, false); - } - - // Check whether this is the name we were searching for, and if it is then - // bail out early. - if (MangledName == SearchName) - if (!ExportedSymbolsOnly || GV.hasDefaultVisibility()) - return &GV; - - // Otherwise add this to the map for later. - Names[MangledName] = &GV; - return nullptr; - } - - // Build the MangledSymbols map. Bails out early (with MangledSymbols left set - // to nullptr) if the given SearchName is found while building the map. - const GlobalValue* buildMangledSymbols(StringRef SearchName, - bool ExportedSymbolsOnly) const { - assert(!MangledSymbols && "Mangled symbols map already exists?"); - - auto Symbols = std::make_unique>(); - - Mangler Mang; - - for (const auto &GO : M->global_objects()) - if (auto GV = addGlobalValue(*Symbols, GO, Mang, SearchName, - ExportedSymbolsOnly)) - return GV; - - MangledSymbols = std::move(Symbols); - return nullptr; - } - - enum { NotEmitted, Emitting, Emitted } EmitState = NotEmitted; - VModuleKey K; - std::unique_ptr M; - mutable std::unique_ptr> MangledSymbols; - }; - - BaseLayerT &BaseLayer; - std::map> ModuleMap; - -public: - - /// Construct a lazy emitting layer. 
- LLVM_ATTRIBUTE_DEPRECATED( - LazyEmittingLayer(BaseLayerT &BaseLayer), - "ORCv1 layers (including LazyEmittingLayer) are deprecated. Please use " - "ORCv2, where lazy emission is the default"); - - /// Construct a lazy emitting layer. - LazyEmittingLayer(ORCv1DeprecationAcknowledgement, BaseLayerT &BaseLayer) - : BaseLayer(BaseLayer) {} - - /// Add the given module to the lazy emitting layer. - Error addModule(VModuleKey K, std::unique_ptr M) { - assert(!ModuleMap.count(K) && "VModuleKey K already in use"); - ModuleMap[K] = - std::make_unique(std::move(K), std::move(M)); - return Error::success(); - } - - /// Remove the module represented by the given handle. - /// - /// This method will free the memory associated with the given module, both - /// in this layer, and the base layer. - Error removeModule(VModuleKey K) { - auto I = ModuleMap.find(K); - assert(I != ModuleMap.end() && "VModuleKey K not valid here"); - auto EDM = std::move(I.second); - ModuleMap.erase(I); - return EDM->removeModuleFromBaseLayer(BaseLayer); - } - - /// Search for the given named symbol. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - // Look for the symbol among existing definitions. - if (auto Symbol = BaseLayer.findSymbol(Name, ExportedSymbolsOnly)) - return Symbol; - - // If not found then search the deferred modules. If any of these contain a - // definition of 'Name' then they will return a JITSymbol that will emit - // the corresponding module when the symbol address is requested. - for (auto &KV : ModuleMap) - if (auto Symbol = KV.second->find(Name, ExportedSymbolsOnly, BaseLayer)) - return Symbol; - - // If no definition found anywhere return a null symbol. 
- return nullptr; - } - - /// Get the address of the given symbol in the context of the of - /// compiled modules represented by the key K. - JITSymbol findSymbolIn(VModuleKey K, const std::string &Name, - bool ExportedSymbolsOnly) { - assert(ModuleMap.count(K) && "VModuleKey K not valid here"); - return ModuleMap[K]->find(Name, ExportedSymbolsOnly, BaseLayer); - } - - /// Immediately emit and finalize the module represented by the given - /// key. - Error emitAndFinalize(VModuleKey K) { - assert(ModuleMap.count(K) && "VModuleKey K not valid here"); - return ModuleMap[K]->emitAndFinalize(BaseLayer); - } -}; - -template -LazyEmittingLayer::LazyEmittingLayer(BaseLayerT &BaseLayer) - : BaseLayer(BaseLayer) {} - -} // end namespace orc -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h index 0d3ccecdf121..e6a9d8945285 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/LazyReexports.h @@ -40,6 +40,9 @@ public: using NotifyResolvedFunction = unique_function; + LazyCallThroughManager(ExecutionSession &ES, + JITTargetAddress ErrorHandlerAddr, TrampolinePool *TP); + // Return a free call-through trampoline and bind it to look up and call // through to the given symbol. 
Expected @@ -56,9 +59,6 @@ protected: using NotifyLandingResolvedFunction = TrampolinePool::NotifyLandingResolvedFunction; - LazyCallThroughManager(ExecutionSession &ES, - JITTargetAddress ErrorHandlerAddr, TrampolinePool *TP); - struct ReexportsEntry { JITDylib *SourceJD; SymbolStringPtr SymbolName; @@ -144,12 +144,12 @@ public: IndirectStubsManager &ISManager, JITDylib &SourceJD, SymbolAliasMap CallableAliases, - ImplSymbolMap *SrcJDLoc, VModuleKey K); + ImplSymbolMap *SrcJDLoc); StringRef getName() const override; private: - void materialize(MaterializationResponsibility R) override; + void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases); @@ -166,11 +166,10 @@ private: inline std::unique_ptr lazyReexports(LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager, JITDylib &SourceJD, - SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc = nullptr, - VModuleKey K = VModuleKey()) { + SymbolAliasMap CallableAliases, + ImplSymbolMap *SrcJDLoc = nullptr) { return std::make_unique( - LCTManager, ISManager, SourceJD, std::move(CallableAliases), SrcJDLoc, - std::move(K)); + LCTManager, ISManager, SourceJD, std::move(CallableAliases), SrcJDLoc); } } // End namespace orc diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h deleted file mode 100644 index b20202a49ef6..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/Legacy.h +++ /dev/null @@ -1,211 +0,0 @@ -//===--- Legacy.h -- Adapters for ExecutionEngine API interop ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Contains core ORC APIs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_LEGACY_H -#define LLVM_EXECUTIONENGINE_ORC_LEGACY_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Core.h" - -namespace llvm { -namespace orc { - -/// SymbolResolver is a composable interface for looking up symbol flags -/// and addresses using the AsynchronousSymbolQuery type. It will -/// eventually replace the LegacyJITSymbolResolver interface as the -/// stardard ORC symbol resolver type. -/// -/// FIXME: SymbolResolvers should go away and be replaced with VSOs with -/// defenition generators. -class SymbolResolver { -public: - virtual ~SymbolResolver() = default; - - /// Returns the subset of the given symbols that the caller is responsible for - /// materializing. - virtual SymbolNameSet getResponsibilitySet(const SymbolNameSet &Symbols) = 0; - - /// For each symbol in Symbols that can be found, assigns that symbols - /// value in Query. Returns the set of symbols that could not be found. - virtual SymbolNameSet lookup(std::shared_ptr Query, - SymbolNameSet Symbols) = 0; - -private: - virtual void anchor(); -}; - -/// Implements SymbolResolver with a pair of supplied function objects -/// for convenience. See createSymbolResolver. 
-template -class LambdaSymbolResolver final : public SymbolResolver { -public: - template - LambdaSymbolResolver(GetResponsibilitySetFnRef &&GetResponsibilitySet, - LookupFnRef &&Lookup) - : GetResponsibilitySet( - std::forward(GetResponsibilitySet)), - Lookup(std::forward(Lookup)) {} - - SymbolNameSet getResponsibilitySet(const SymbolNameSet &Symbols) final { - return GetResponsibilitySet(Symbols); - } - - SymbolNameSet lookup(std::shared_ptr Query, - SymbolNameSet Symbols) final { - return Lookup(std::move(Query), std::move(Symbols)); - } - -private: - GetResponsibilitySetFn GetResponsibilitySet; - LookupFn Lookup; -}; - -/// Creates a SymbolResolver implementation from the pair of supplied -/// function objects. -template -std::unique_ptr>, - std::remove_cv_t>>> -createSymbolResolver(GetResponsibilitySetFn &&GetResponsibilitySet, - LookupFn &&Lookup) { - using LambdaSymbolResolverImpl = LambdaSymbolResolver< - std::remove_cv_t>, - std::remove_cv_t>>; - return std::make_unique( - std::forward(GetResponsibilitySet), - std::forward(Lookup)); -} - -/// Legacy adapter. Remove once we kill off the old ORC layers. -class JITSymbolResolverAdapter : public JITSymbolResolver { -public: - JITSymbolResolverAdapter(ExecutionSession &ES, SymbolResolver &R, - MaterializationResponsibility *MR); - Expected getResponsibilitySet(const LookupSet &Symbols) override; - void lookup(const LookupSet &Symbols, OnResolvedFunction OnResolved) override; - -private: - ExecutionSession &ES; - std::set ResolvedStrings; - SymbolResolver &R; - MaterializationResponsibility *MR; -}; - -/// Use the given legacy-style FindSymbol function (i.e. a function that takes -/// a const std::string& or StringRef and returns a JITSymbol) to get the -/// subset of symbols that the caller is responsible for materializing. If any -/// JITSymbol returned by FindSymbol is in an error state the function returns -/// immediately with that error. 
-/// -/// Useful for implementing getResponsibilitySet bodies that query legacy -/// resolvers. -template -Expected -getResponsibilitySetWithLegacyFn(const SymbolNameSet &Symbols, - FindSymbolFn FindSymbol) { - SymbolNameSet Result; - - for (auto &S : Symbols) { - if (JITSymbol Sym = FindSymbol(*S)) { - if (!Sym.getFlags().isStrong()) - Result.insert(S); - } else if (auto Err = Sym.takeError()) - return std::move(Err); - } - - return Result; -} - -/// Use the given legacy-style FindSymbol function (i.e. a function that -/// takes a const std::string& or StringRef and returns a JITSymbol) to -/// find the address and flags for each symbol in Symbols and store the -/// result in Query. If any JITSymbol returned by FindSymbol is in an -/// error then Query.notifyFailed(...) is called with that error and the -/// function returns immediately. On success, returns the set of symbols -/// not found. -/// -/// Useful for implementing lookup bodies that query legacy resolvers. -template -SymbolNameSet -lookupWithLegacyFn(ExecutionSession &ES, AsynchronousSymbolQuery &Query, - const SymbolNameSet &Symbols, FindSymbolFn FindSymbol) { - SymbolNameSet SymbolsNotFound; - bool NewSymbolsResolved = false; - - for (auto &S : Symbols) { - if (JITSymbol Sym = FindSymbol(*S)) { - if (auto Addr = Sym.getAddress()) { - Query.notifySymbolMetRequiredState( - S, JITEvaluatedSymbol(*Addr, Sym.getFlags())); - NewSymbolsResolved = true; - } else { - ES.legacyFailQuery(Query, Addr.takeError()); - return SymbolNameSet(); - } - } else if (auto Err = Sym.takeError()) { - ES.legacyFailQuery(Query, std::move(Err)); - return SymbolNameSet(); - } else - SymbolsNotFound.insert(S); - } - - if (NewSymbolsResolved && Query.isComplete()) - Query.handleComplete(); - - return SymbolsNotFound; -} - -/// An ORC SymbolResolver implementation that uses a legacy -/// findSymbol-like function to perform lookup; -template -class LegacyLookupFnResolver final : public SymbolResolver { -public: - using ErrorReporter 
= std::function; - - LegacyLookupFnResolver(ExecutionSession &ES, LegacyLookupFn LegacyLookup, - ErrorReporter ReportError) - : ES(ES), LegacyLookup(std::move(LegacyLookup)), - ReportError(std::move(ReportError)) {} - - SymbolNameSet getResponsibilitySet(const SymbolNameSet &Symbols) final { - if (auto ResponsibilitySet = - getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup)) - return std::move(*ResponsibilitySet); - else { - ReportError(ResponsibilitySet.takeError()); - return SymbolNameSet(); - } - } - - SymbolNameSet lookup(std::shared_ptr Query, - SymbolNameSet Symbols) final { - return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup); - } - -private: - ExecutionSession &ES; - LegacyLookupFn LegacyLookup; - ErrorReporter ReportError; -}; - -template -std::shared_ptr> -createLegacyLookupResolver(ExecutionSession &ES, LegacyLookupFn LegacyLookup, - std::function ErrorReporter) { - return std::make_shared>( - ES, std::move(LegacyLookup), std::move(ErrorReporter)); -} - -} // End namespace orc -} // End namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_LEGACY_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index 15fe079eccaf..90e1d4704f34 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -98,8 +98,9 @@ public: ExecutionSession &getExecutionSession() const { return ES; } Error setupJITDylib(JITDylib &JD) override; - Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) override; - Error notifyRemoving(JITDylib &JD, VModuleKey K) override; + Error notifyAdding(ResourceTracker &RT, + const MaterializationUnit &MU) override; + Error notifyRemoving(ResourceTracker &RT) override; Expected getInitializerSequence(JITDylib &JD); @@ -119,6 +120,19 @@ private: LocalDependenciesMap getSyntheticSymbolLocalDependencies( 
MaterializationResponsibility &MR) override; + // FIXME: We should be tentatively tracking scraped sections and discarding + // if the MR fails. + Error notifyFailed(MaterializationResponsibility &MR) override { + return Error::success(); + } + + Error notifyRemovingResources(ResourceKey K) override { + return Error::success(); + } + + void notifyTransferringResources(ResourceKey DstKey, + ResourceKey SrcKey) override {} + private: using InitSymbolDepMap = DenseMap; @@ -136,8 +150,6 @@ private: InitSymbolDepMap InitSymbolDeps; }; - static std::vector getDFSLinkOrder(JITDylib &JD); - void registerInitInfo(JITDylib &JD, JITTargetAddress ObjCImageInfoAddr, MachOJITDylibInitializers::SectionExtent ModInits, MachOJITDylibInitializers::SectionExtent ObjCSelRefs, diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h deleted file mode 100644 index ffa37a13d064..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/NullResolver.h +++ /dev/null @@ -1,43 +0,0 @@ -//===------ NullResolver.h - Reject symbol lookup requests ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Defines a RuntimeDyld::SymbolResolver subclass that rejects all symbol -// resolution requests, for clients that have no cross-object fixups. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_NULLRESOLVER_H -#define LLVM_EXECUTIONENGINE_ORC_NULLRESOLVER_H - -#include "llvm/ExecutionEngine/Orc/Legacy.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" - -namespace llvm { -namespace orc { - -class NullResolver : public SymbolResolver { -public: - SymbolNameSet getResponsibilitySet(const SymbolNameSet &Symbols) final; - - SymbolNameSet lookup(std::shared_ptr Query, - SymbolNameSet Symbols) final; -}; - -/// SymbolResolver impliementation that rejects all resolution requests. -/// Useful for clients that have no cross-object fixups. -class NullLegacyResolver : public LegacyJITSymbolResolver { -public: - JITSymbol findSymbol(const std::string &Name) final; - - JITSymbol findSymbolInLogicalDylib(const std::string &Name) final; -}; - -} // End namespace orc. -} // End namespace llvm. - -#endif // LLVM_EXECUTIONENGINE_ORC_NULLRESOLVER_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index 2bfe3b001709..f2975e29fcd6 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -35,6 +35,7 @@ namespace llvm { namespace jitlink { class EHFrameRegistrar; +class LinkGraph; class Symbol; } // namespace jitlink @@ -51,7 +52,7 @@ class ObjectLinkingLayerJITLinkContext; /// Clients can use this class to add relocatable object files to an /// ExecutionSession, and it typically serves as the base layer (underneath /// a compiling layer like IRCompileLayer) for the rest of the JIT. 
-class ObjectLinkingLayer : public ObjectLayer { +class ObjectLinkingLayer : public ObjectLayer, private ResourceManager { friend class ObjectLinkingLayerJITLinkContext; public: @@ -72,10 +73,10 @@ public: virtual Error notifyEmitted(MaterializationResponsibility &MR) { return Error::success(); } - virtual Error notifyRemovingModule(VModuleKey K) { - return Error::success(); - } - virtual Error notifyRemovingAllModules() { return Error::success(); } + virtual Error notifyFailed(MaterializationResponsibility &MR) = 0; + virtual Error notifyRemovingResources(ResourceKey K) = 0; + virtual void notifyTransferringResources(ResourceKey DstKey, + ResourceKey SrcKey) = 0; /// Return any dependencies that synthetic symbols (e.g. init symbols) /// have on locally scoped jitlink::Symbols. This is used by the @@ -90,8 +91,14 @@ public: using ReturnObjectBufferFunction = std::function)>; - /// Construct an ObjectLinkingLayer with the given NotifyLoaded, - /// and NotifyEmitted functors. + /// Construct an ObjectLinkingLayer. + ObjectLinkingLayer(ExecutionSession &ES, + jitlink::JITLinkMemoryManager &MemMgr); + + /// Construct an ObjectLinkingLayer. Takes ownership of the given + /// JITLinkMemoryManager. This method is a temporary hack to simplify + /// co-existence with RTDyldObjectLinkingLayer (which also owns its + /// allocators). ObjectLinkingLayer(ExecutionSession &ES, std::unique_ptr MemMgr); @@ -112,10 +119,14 @@ public: return *this; } - /// Emit the object. - void emit(MaterializationResponsibility R, + /// Emit an object file. + void emit(std::unique_ptr R, std::unique_ptr O) override; + /// Emit a LinkGraph. + void emit(std::unique_ptr R, + std::unique_ptr G); + /// Instructs this ObjectLinkingLayer instance to override the symbol flags /// found in the AtomGraph with the flags supplied by the /// MaterializationResponsibility instance. 
This is a workaround to support @@ -155,27 +166,31 @@ private: void notifyLoaded(MaterializationResponsibility &MR); Error notifyEmitted(MaterializationResponsibility &MR, AllocPtr Alloc); - Error removeModule(VModuleKey K); - Error removeAllModules(); + Error handleRemoveResources(ResourceKey K) override; + void handleTransferResources(ResourceKey DstKey, ResourceKey SrcKey) override; mutable std::mutex LayerMutex; - std::unique_ptr MemMgr; + jitlink::JITLinkMemoryManager &MemMgr; + std::unique_ptr MemMgrOwnership; bool OverrideObjectFlags = false; bool AutoClaimObjectSymbols = false; ReturnObjectBufferFunction ReturnObjectBuffer; - DenseMap TrackedAllocs; - std::vector UntrackedAllocs; + DenseMap> Allocs; std::vector> Plugins; }; class EHFrameRegistrationPlugin : public ObjectLinkingLayer::Plugin { public: - EHFrameRegistrationPlugin(jitlink::EHFrameRegistrar &Registrar); - Error notifyEmitted(MaterializationResponsibility &MR) override; + EHFrameRegistrationPlugin( + ExecutionSession &ES, + std::unique_ptr Registrar); void modifyPassConfig(MaterializationResponsibility &MR, const Triple &TT, jitlink::PassConfiguration &PassConfig) override; - Error notifyRemovingModule(VModuleKey K) override; - Error notifyRemovingAllModules() override; + Error notifyEmitted(MaterializationResponsibility &MR) override; + Error notifyFailed(MaterializationResponsibility &MR) override; + Error notifyRemovingResources(ResourceKey K) override; + void notifyTransferringResources(ResourceKey DstKey, + ResourceKey SrcKey) override; private: @@ -185,10 +200,10 @@ private: }; std::mutex EHFramePluginMutex; - jitlink::EHFrameRegistrar &Registrar; + ExecutionSession &ES; + std::unique_ptr Registrar; DenseMap InProcessLinks; - DenseMap TrackedEHFrameRanges; - std::vector UntrackedEHFrameRanges; + DenseMap> EHFrameRanges; }; } // end namespace orc diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h 
b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h index bf989cc8677c..d8395ab34e47 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -31,7 +31,7 @@ public: ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, TransformFunction Transform = TransformFunction()); - void emit(MaterializationResponsibility R, + void emit(std::unique_ptr R, std::unique_ptr O) override; void setTransform(TransformFunction Transform) { @@ -43,88 +43,6 @@ private: TransformFunction Transform; }; -/// Object mutating layer. -/// -/// This layer accepts sets of ObjectFiles (via addObject). It -/// immediately applies the user supplied functor to each object, then adds -/// the set of transformed objects to the layer below. -template -class LegacyObjectTransformLayer { -public: - /// Construct an ObjectTransformLayer with the given BaseLayer - LLVM_ATTRIBUTE_DEPRECATED( - LegacyObjectTransformLayer(BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()), - "ORCv1 layers (layers with the 'Legacy' prefix) are deprecated. Please " - "use " - "the ORCv2 ObjectTransformLayer instead"); - - /// Legacy layer constructor with deprecation acknowledgement. - LegacyObjectTransformLayer(ORCv1DeprecationAcknowledgement, - BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()) - : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} - - /// Apply the transform functor to each object in the object set, then - /// add the resulting set of objects to the base layer, along with the - /// memory manager and symbol resolver. - /// - /// @return A handle for the added objects. - template Error addObject(VModuleKey K, ObjectPtr Obj) { - return BaseLayer.addObject(std::move(K), Transform(std::move(Obj))); - } - - /// Remove the object set associated with the VModuleKey K. 
- Error removeObject(VModuleKey K) { return BaseLayer.removeObject(K); } - - /// Search for the given named symbol. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it exists. - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - return BaseLayer.findSymbol(Name, ExportedSymbolsOnly); - } - - /// Get the address of the given symbol in the context of the set of - /// objects represented by the VModuleKey K. This call is forwarded to - /// the base layer's implementation. - /// @param K The VModuleKey associated with the object set to search in. - /// @param Name The name of the symbol to search for. - /// @param ExportedSymbolsOnly If true, search only for exported symbols. - /// @return A handle for the given named symbol, if it is found in the - /// given object set. - JITSymbol findSymbolIn(VModuleKey K, const std::string &Name, - bool ExportedSymbolsOnly) { - return BaseLayer.findSymbolIn(K, Name, ExportedSymbolsOnly); - } - - /// Immediately emit and finalize the object set represented by the - /// given VModuleKey K. - Error emitAndFinalize(VModuleKey K) { return BaseLayer.emitAndFinalize(K); } - - /// Map section addresses for the objects associated with the - /// VModuleKey K. - void mapSectionAddress(VModuleKey K, const void *LocalAddress, - JITTargetAddress TargetAddr) { - BaseLayer.mapSectionAddress(K, LocalAddress, TargetAddr); - } - - /// Access the transform functor directly. - TransformFtor &getTransform() { return Transform; } - - /// Access the mumate functor directly. 
- const TransformFtor &getTransform() const { return Transform; } - -private: - BaseLayerT &BaseLayer; - TransformFtor Transform; -}; - -template -LegacyObjectTransformLayer:: - LegacyObjectTransformLayer(BaseLayerT &BaseLayer, TransformFtor Transform) - : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} - } // end namespace orc } // end namespace llvm diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h deleted file mode 100644 index 9b0d941f5459..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h +++ /dev/null @@ -1,74 +0,0 @@ -//===------ OrcError.h - Reject symbol lookup requests ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Define an error category, error codes, and helper utilities for Orc. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_ORCERROR_H -#define LLVM_EXECUTIONENGINE_ORC_ORCERROR_H - -#include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -namespace llvm { -namespace orc { - -enum class OrcErrorCode : int { - // RPC Errors - UnknownORCError = 1, - DuplicateDefinition, - JITSymbolNotFound, - RemoteAllocatorDoesNotExist, - RemoteAllocatorIdAlreadyInUse, - RemoteMProtectAddrUnrecognized, - RemoteIndirectStubsOwnerDoesNotExist, - RemoteIndirectStubsOwnerIdAlreadyInUse, - RPCConnectionClosed, - RPCCouldNotNegotiateFunction, - RPCResponseAbandoned, - UnexpectedRPCCall, - UnexpectedRPCResponse, - UnknownErrorCodeFromRemote, - UnknownResourceHandle, - MissingSymbolDefinitions, - UnexpectedSymbolDefinitions, -}; - -std::error_code orcError(OrcErrorCode ErrCode); - -class DuplicateDefinition : public ErrorInfo { -public: - static char ID; - - DuplicateDefinition(std::string SymbolName); - std::error_code convertToErrorCode() const override; - void log(raw_ostream &OS) const override; - const std::string &getSymbolName() const; -private: - std::string SymbolName; -}; - -class JITSymbolNotFound : public ErrorInfo { -public: - static char ID; - - JITSymbolNotFound(std::string SymbolName); - std::error_code convertToErrorCode() const override; - void log(raw_ostream &OS) const override; - const std::string &getSymbolName() const; -private: - std::string SymbolName; -}; - -} // End namespace orc. -} // End namespace llvm. 
- -#endif // LLVM_EXECUTIONENGINE_ORC_ORCERROR_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCTargetProcessControl.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCTargetProcessControl.h new file mode 100644 index 000000000000..a8aa42799115 --- /dev/null +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCTargetProcessControl.h @@ -0,0 +1,415 @@ +//===--- OrcRPCTargetProcessControl.h - Remote target control ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utilities for interacting with target processes. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_ORCRPCTARGETPROCESSCONTROL_H +#define LLVM_EXECUTIONENGINE_ORC_ORCRPCTARGETPROCESSCONTROL_H + +#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h" +#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h" +#include "llvm/ExecutionEngine/Orc/TargetProcessControl.h" +#include "llvm/Support/MSVCErrorWorkarounds.h" + +namespace llvm { +namespace orc { + +/// JITLinkMemoryManager implementation for a process connected via an ORC RPC +/// endpoint. 
+template +class OrcRPCTPCJITLinkMemoryManager : public jitlink::JITLinkMemoryManager { +private: + struct HostAlloc { + std::unique_ptr Mem; + uint64_t Size; + }; + + struct TargetAlloc { + JITTargetAddress Address = 0; + uint64_t AllocatedSize = 0; + }; + + using HostAllocMap = DenseMap; + using TargetAllocMap = DenseMap; + +public: + class OrcRPCAllocation : public Allocation { + public: + OrcRPCAllocation(OrcRPCTPCJITLinkMemoryManager &Parent, + HostAllocMap HostAllocs, TargetAllocMap TargetAllocs) + : Parent(Parent), HostAllocs(std::move(HostAllocs)), + TargetAllocs(std::move(TargetAllocs)) { + assert(HostAllocs.size() == TargetAllocs.size() && + "HostAllocs size should match TargetAllocs"); + } + + ~OrcRPCAllocation() override { + assert(TargetAllocs.empty() && "failed to deallocate"); + } + + MutableArrayRef getWorkingMemory(ProtectionFlags Seg) override { + auto I = HostAllocs.find(Seg); + assert(I != HostAllocs.end() && "No host allocation for segment"); + auto &HA = I->second; + return {HA.Mem.get(), static_cast(HA.Size)}; + } + + JITTargetAddress getTargetMemory(ProtectionFlags Seg) override { + auto I = TargetAllocs.find(Seg); + assert(I != TargetAllocs.end() && "No target allocation for segment"); + return I->second.Address; + } + + void finalizeAsync(FinalizeContinuation OnFinalize) override { + + std::vector BufferWrites; + orcrpctpc::ReleaseOrFinalizeMemRequest FMR; + + for (auto &KV : HostAllocs) { + assert(TargetAllocs.count(KV.first) && + "No target allocation for buffer"); + auto &HA = KV.second; + auto &TA = TargetAllocs[KV.first]; + BufferWrites.push_back({TA.Address, StringRef(HA.Mem.get(), HA.Size)}); + FMR.push_back({orcrpctpc::toWireProtectionFlags( + static_cast(KV.first)), + TA.Address, TA.AllocatedSize}); + } + + DEBUG_WITH_TYPE("orc", { + dbgs() << "finalizeAsync " << (void *)this << ":\n"; + auto FMRI = FMR.begin(); + for (auto &B : BufferWrites) { + auto Prot = FMRI->Prot; + ++FMRI; + dbgs() << " Writing " << formatv("{0:x16}", 
B.Buffer.size()) + << " bytes to " << ((Prot & orcrpctpc::WPF_Read) ? 'R' : '-') + << ((Prot & orcrpctpc::WPF_Write) ? 'W' : '-') + << ((Prot & orcrpctpc::WPF_Exec) ? 'X' : '-') + << " segment: local " << (const void *)B.Buffer.data() + << " -> target " << formatv("{0:x16}", B.Address) << "\n"; + } + }); + if (auto Err = + Parent.Parent.getMemoryAccess().writeBuffers(BufferWrites)) { + OnFinalize(std::move(Err)); + return; + } + + DEBUG_WITH_TYPE("orc", dbgs() << " Applying permissions...\n"); + if (auto Err = + Parent.getEndpoint().template callAsync( + [OF = std::move(OnFinalize)](Error Err2) { + // FIXME: Dispatch to work queue. + std::thread([OF = std::move(OF), + Err3 = std::move(Err2)]() mutable { + DEBUG_WITH_TYPE( + "orc", { dbgs() << " finalizeAsync complete\n"; }); + OF(std::move(Err3)); + }).detach(); + return Error::success(); + }, + FMR)) { + DEBUG_WITH_TYPE("orc", dbgs() << " failed.\n"); + Parent.getEndpoint().abandonPendingResponses(); + Parent.reportError(std::move(Err)); + } + DEBUG_WITH_TYPE("orc", { + dbgs() << "Leaving finalizeAsync (finalization may continue in " + "background)\n"; + }); + } + + Error deallocate() override { + orcrpctpc::ReleaseOrFinalizeMemRequest RMR; + for (auto &KV : TargetAllocs) + RMR.push_back({orcrpctpc::toWireProtectionFlags( + static_cast(KV.first)), + KV.second.Address, KV.second.AllocatedSize}); + TargetAllocs.clear(); + + return Parent.getEndpoint().template callB(RMR); + } + + private: + OrcRPCTPCJITLinkMemoryManager &Parent; + HostAllocMap HostAllocs; + TargetAllocMap TargetAllocs; + }; + + OrcRPCTPCJITLinkMemoryManager(OrcRPCTPCImplT &Parent) : Parent(Parent) {} + + Expected> + allocate(const jitlink::JITLinkDylib *JD, + const SegmentsRequestMap &Request) override { + orcrpctpc::ReserveMemRequest RMR; + HostAllocMap HostAllocs; + + for (auto &KV : Request) { + assert(KV.second.getContentSize() <= std::numeric_limits::max() && + "Content size is out-of-range for host"); + + 
RMR.push_back({orcrpctpc::toWireProtectionFlags( + static_cast(KV.first)), + KV.second.getContentSize() + KV.second.getZeroFillSize(), + KV.second.getAlignment()}); + HostAllocs[KV.first] = { + std::make_unique(KV.second.getContentSize()), + KV.second.getContentSize()}; + } + + DEBUG_WITH_TYPE("orc", { + dbgs() << "Orc remote memmgr got request:\n"; + for (auto &KV : Request) + dbgs() << " permissions: " + << ((KV.first & sys::Memory::MF_READ) ? 'R' : '-') + << ((KV.first & sys::Memory::MF_WRITE) ? 'W' : '-') + << ((KV.first & sys::Memory::MF_EXEC) ? 'X' : '-') + << ", content size: " + << formatv("{0:x16}", KV.second.getContentSize()) + << " + zero-fill-size: " + << formatv("{0:x16}", KV.second.getZeroFillSize()) + << ", align: " << KV.second.getAlignment() << "\n"; + }); + + // FIXME: LLVM RPC needs to be fixed to support alt + // serialization/deserialization on return types. For now just + // translate from std::map to DenseMap manually. + auto TmpTargetAllocs = + Parent.getEndpoint().template callB(RMR); + if (!TmpTargetAllocs) + return TmpTargetAllocs.takeError(); + + if (TmpTargetAllocs->size() != RMR.size()) + return make_error( + "Number of target allocations does not match request", + inconvertibleErrorCode()); + + TargetAllocMap TargetAllocs; + for (auto &E : *TmpTargetAllocs) + TargetAllocs[orcrpctpc::fromWireProtectionFlags(E.Prot)] = { + E.Address, E.AllocatedSize}; + + DEBUG_WITH_TYPE("orc", { + auto HAI = HostAllocs.begin(); + for (auto &KV : TargetAllocs) + dbgs() << " permissions: " + << ((KV.first & sys::Memory::MF_READ) ? 'R' : '-') + << ((KV.first & sys::Memory::MF_WRITE) ? 'W' : '-') + << ((KV.first & sys::Memory::MF_EXEC) ? 
'X' : '-') + << " assigned local " << (void *)HAI->second.Mem.get() + << ", target " << formatv("{0:x16}", KV.second.Address) << "\n"; + }); + + return std::make_unique(*this, std::move(HostAllocs), + std::move(TargetAllocs)); + } + +private: + void reportError(Error Err) { Parent.reportError(std::move(Err)); } + + decltype(std::declval().getEndpoint()) getEndpoint() { + return Parent.getEndpoint(); + } + + OrcRPCTPCImplT &Parent; +}; + +/// TargetProcessControl::MemoryAccess implementation for a process connected +/// via an ORC RPC endpoint. +template +class OrcRPCTPCMemoryAccess : public TargetProcessControl::MemoryAccess { +public: + OrcRPCTPCMemoryAccess(OrcRPCTPCImplT &Parent) : Parent(Parent) {} + + void writeUInt8s(ArrayRef Ws, + WriteResultFn OnWriteComplete) override { + writeViaRPC(Ws, std::move(OnWriteComplete)); + } + + void writeUInt16s(ArrayRef Ws, + WriteResultFn OnWriteComplete) override { + writeViaRPC(Ws, std::move(OnWriteComplete)); + } + + void writeUInt32s(ArrayRef Ws, + WriteResultFn OnWriteComplete) override { + writeViaRPC(Ws, std::move(OnWriteComplete)); + } + + void writeUInt64s(ArrayRef Ws, + WriteResultFn OnWriteComplete) override { + writeViaRPC(Ws, std::move(OnWriteComplete)); + } + + void writeBuffers(ArrayRef Ws, + WriteResultFn OnWriteComplete) override { + writeViaRPC(Ws, std::move(OnWriteComplete)); + } + +private: + template + void writeViaRPC(ArrayRef Ws, WriteResultFn OnWriteComplete) { + if (auto Err = Parent.getEndpoint().template callAsync( + [OWC = std::move(OnWriteComplete)](Error Err2) mutable -> Error { + OWC(std::move(Err2)); + return Error::success(); + }, + Ws)) { + Parent.reportError(std::move(Err)); + Parent.getEndpoint().abandonPendingResponses(); + } + } + + OrcRPCTPCImplT &Parent; +}; + +// TargetProcessControl for a process connected via an ORC RPC Endpoint. 
+template +class OrcRPCTargetProcessControlBase : public TargetProcessControl { +public: + using ErrorReporter = unique_function; + + using OnCloseConnectionFunction = unique_function; + + OrcRPCTargetProcessControlBase(std::shared_ptr SSP, + RPCEndpointT &EP, ErrorReporter ReportError) + : TargetProcessControl(std::move(SSP)), + ReportError(std::move(ReportError)), EP(EP) {} + + void reportError(Error Err) { ReportError(std::move(Err)); } + + RPCEndpointT &getEndpoint() { return EP; } + + Expected loadDylib(const char *DylibPath) override { + DEBUG_WITH_TYPE("orc", { + dbgs() << "Loading dylib \"" << (DylibPath ? DylibPath : "") << "\" "; + if (!DylibPath) + dbgs() << "(process symbols)"; + dbgs() << "\n"; + }); + if (!DylibPath) + DylibPath = ""; + auto H = EP.template callB(DylibPath); + DEBUG_WITH_TYPE("orc", { + if (H) + dbgs() << " got handle " << formatv("{0:x16}", *H) << "\n"; + else + dbgs() << " error, unable to load\n"; + }); + return H; + } + + Expected> + lookupSymbols(ArrayRef Request) override { + std::vector RR; + for (auto &E : Request) { + RR.push_back({}); + RR.back().first = E.Handle; + for (auto &KV : E.Symbols) + RR.back().second.push_back( + {(*KV.first).str(), + KV.second == SymbolLookupFlags::WeaklyReferencedSymbol}); + } + DEBUG_WITH_TYPE("orc", { + dbgs() << "Compound lookup:\n"; + for (auto &R : Request) { + dbgs() << " In " << formatv("{0:x16}", R.Handle) << ": {"; + bool First = true; + for (auto &KV : R.Symbols) { + dbgs() << (First ? "" : ",") << " " << *KV.first; + First = false; + } + dbgs() << " }\n"; + } + }); + return EP.template callB(RR); + } + + Expected runAsMain(JITTargetAddress MainFnAddr, + ArrayRef Args) override { + DEBUG_WITH_TYPE("orc", { + dbgs() << "Running as main: " << formatv("{0:x16}", MainFnAddr) + << ", args = ["; + for (unsigned I = 0; I != Args.size(); ++I) + dbgs() << (I ? 
"," : "") << " \"" << Args[I] << "\""; + dbgs() << "]\n"; + }); + auto Result = EP.template callB(MainFnAddr, Args); + DEBUG_WITH_TYPE("orc", { + dbgs() << " call to " << formatv("{0:x16}", MainFnAddr); + if (Result) + dbgs() << " returned result " << *Result << "\n"; + else + dbgs() << " failed\n"; + }); + return Result; + } + + Expected + runWrapper(JITTargetAddress WrapperFnAddr, + ArrayRef ArgBuffer) override { + DEBUG_WITH_TYPE("orc", { + dbgs() << "Running as wrapper function " + << formatv("{0:x16}", WrapperFnAddr) << " with " + << formatv("{0:x16}", ArgBuffer.size()) << " argument buffer\n"; + }); + auto Result = + EP.template callB(WrapperFnAddr, ArgBuffer); + // dbgs() << "Returned from runWrapper...\n"; + return Result; + } + + Error closeConnection(OnCloseConnectionFunction OnCloseConnection) { + DEBUG_WITH_TYPE("orc", dbgs() << "Closing connection to remote\n"); + return EP.template callAsync( + std::move(OnCloseConnection)); + } + + Error closeConnectionAndWait() { + std::promise P; + auto F = P.get_future(); + if (auto Err = closeConnection([&](Error Err2) -> Error { + P.set_value(std::move(Err2)); + return Error::success(); + })) { + EP.abandonAllPendingResponses(); + return joinErrors(std::move(Err), F.get()); + } + return F.get(); + } + +protected: + /// Subclasses must call this during construction to initialize the + /// TargetTriple and PageSize members. 
+ Error initializeORCRPCTPCBase() { + if (auto TripleOrErr = EP.template callB()) + TargetTriple = Triple(*TripleOrErr); + else + return TripleOrErr.takeError(); + + if (auto PageSizeOrErr = EP.template callB()) + PageSize = *PageSizeOrErr; + else + return PageSizeOrErr.takeError(); + + return Error::success(); + } + +private: + ErrorReporter ReportError; + RPCEndpointT &EP; +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_ORCRPCTARGETPROCESSCONTROL_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h index 86e8d5df3ad9..3d139740d677 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h @@ -20,6 +20,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" @@ -53,7 +54,7 @@ namespace remote { /// OrcRemoteTargetServer class) via an RPC system (see RPCUtils.h) to carry out /// its actions. class OrcRemoteTargetClient - : public rpc::SingleThreadedRPCEndpoint { + : public shared::SingleThreadedRPCEndpoint { public: /// Remote-mapped RuntimeDyld-compatible memory manager. 
class RemoteRTDyldMemoryManager : public RuntimeDyld::MemoryManager { @@ -329,6 +330,221 @@ public: std::vector RegisteredEHFrames; }; + class RPCMMAlloc : public jitlink::JITLinkMemoryManager::Allocation { + using AllocationMap = DenseMap; + using FinalizeContinuation = + jitlink::JITLinkMemoryManager::Allocation::FinalizeContinuation; + using ProtectionFlags = sys::Memory::ProtectionFlags; + using SegmentsRequestMap = + DenseMap; + + RPCMMAlloc(OrcRemoteTargetClient &Client, ResourceIdMgr::ResourceId Id) + : Client(Client), Id(Id) {} + + public: + static Expected> + Create(OrcRemoteTargetClient &Client, ResourceIdMgr::ResourceId Id, + const SegmentsRequestMap &Request) { + auto *MM = new RPCMMAlloc(Client, Id); + + if (Error Err = MM->allocateHostBlocks(Request)) + return std::move(Err); + + if (Error Err = MM->allocateTargetBlocks()) + return std::move(Err); + + return std::unique_ptr(MM); + } + + MutableArrayRef getWorkingMemory(ProtectionFlags Seg) override { + assert(HostSegBlocks.count(Seg) && "No allocation for segment"); + return {static_cast(HostSegBlocks[Seg].base()), + HostSegBlocks[Seg].allocatedSize()}; + } + + JITTargetAddress getTargetMemory(ProtectionFlags Seg) override { + assert(TargetSegBlocks.count(Seg) && "No allocation for segment"); + return pointerToJITTargetAddress(TargetSegBlocks[Seg].base()); + } + + void finalizeAsync(FinalizeContinuation OnFinalize) override { + // Host allocations (working memory) remain ReadWrite. + OnFinalize(copyAndProtect()); + } + + Error deallocate() override { + // TODO: Cannot release target allocation. RPCAPI has no function + // symmetric to reserveMem(). Add RPC call like freeMem()? 
+ return errorCodeToError(sys::Memory::releaseMappedMemory(HostAllocation)); + } + + private: + OrcRemoteTargetClient &Client; + ResourceIdMgr::ResourceId Id; + AllocationMap HostSegBlocks; + AllocationMap TargetSegBlocks; + JITTargetAddress TargetSegmentAddr; + sys::MemoryBlock HostAllocation; + + Error allocateHostBlocks(const SegmentsRequestMap &Request) { + unsigned TargetPageSize = Client.getPageSize(); + + if (!isPowerOf2_64(static_cast(TargetPageSize))) + return make_error("Host page size is not a power of 2", + inconvertibleErrorCode()); + + auto TotalSize = calcTotalAllocSize(Request, TargetPageSize); + if (!TotalSize) + return TotalSize.takeError(); + + // Allocate one slab to cover all the segments. + const sys::Memory::ProtectionFlags ReadWrite = + static_cast(sys::Memory::MF_READ | + sys::Memory::MF_WRITE); + std::error_code EC; + HostAllocation = + sys::Memory::allocateMappedMemory(*TotalSize, nullptr, ReadWrite, EC); + if (EC) + return errorCodeToError(EC); + + char *SlabAddr = static_cast(HostAllocation.base()); +#ifndef NDEBUG + char *SlabAddrEnd = SlabAddr + HostAllocation.allocatedSize(); +#endif + + // Allocate segment memory from the slab. + for (auto &KV : Request) { + const auto &Seg = KV.second; + + uint64_t SegmentSize = Seg.getContentSize() + Seg.getZeroFillSize(); + uint64_t AlignedSegmentSize = alignTo(SegmentSize, TargetPageSize); + + // Zero out zero-fill memory. + char *ZeroFillBegin = SlabAddr + Seg.getContentSize(); + memset(ZeroFillBegin, 0, Seg.getZeroFillSize()); + + // Record the block for this segment. + HostSegBlocks[KV.first] = + sys::MemoryBlock(SlabAddr, AlignedSegmentSize); + + SlabAddr += AlignedSegmentSize; + assert(SlabAddr <= SlabAddrEnd && "Out of range"); + } + + return Error::success(); + } + + Error allocateTargetBlocks() { + // Reserve memory for all blocks on the target. We need as much space on + // the target as we allocated on the host. 
+ TargetSegmentAddr = Client.reserveMem(Id, HostAllocation.allocatedSize(), + Client.getPageSize()); + if (!TargetSegmentAddr) + return make_error("Failed to reserve memory on the target", + inconvertibleErrorCode()); + + // Map memory blocks into the allocation, that match the host allocation. + JITTargetAddress TargetAllocAddr = TargetSegmentAddr; + for (const auto &KV : HostSegBlocks) { + size_t TargetAllocSize = KV.second.allocatedSize(); + + TargetSegBlocks[KV.first] = + sys::MemoryBlock(jitTargetAddressToPointer(TargetAllocAddr), + TargetAllocSize); + + TargetAllocAddr += TargetAllocSize; + assert(TargetAllocAddr - TargetSegmentAddr <= + HostAllocation.allocatedSize() && + "Out of range on target"); + } + + return Error::success(); + } + + Error copyAndProtect() { + unsigned Permissions = 0u; + + // Copy segments one by one. + for (auto &KV : TargetSegBlocks) { + Permissions |= KV.first; + + const sys::MemoryBlock &TargetBlock = KV.second; + const sys::MemoryBlock &HostBlock = HostSegBlocks.lookup(KV.first); + + size_t TargetAllocSize = TargetBlock.allocatedSize(); + auto TargetAllocAddr = pointerToJITTargetAddress(TargetBlock.base()); + auto *HostAllocBegin = static_cast(HostBlock.base()); + + bool CopyErr = + Client.writeMem(TargetAllocAddr, HostAllocBegin, TargetAllocSize); + if (CopyErr) + return createStringError(inconvertibleErrorCode(), + "Failed to copy %d segment to the target", + KV.first); + } + + // Set permission flags for all segments at once. 
+ bool ProtectErr = + Client.setProtections(Id, TargetSegmentAddr, Permissions); + if (ProtectErr) + return createStringError(inconvertibleErrorCode(), + "Failed to apply permissions for %d segment " + "on the target", + Permissions); + return Error::success(); + } + + static Expected + calcTotalAllocSize(const SegmentsRequestMap &Request, + unsigned TargetPageSize) { + size_t TotalSize = 0; + for (const auto &KV : Request) { + const auto &Seg = KV.second; + + if (Seg.getAlignment() > TargetPageSize) + return make_error("Cannot request alignment higher than " + "page alignment on target", + inconvertibleErrorCode()); + + TotalSize = alignTo(TotalSize, TargetPageSize); + TotalSize += Seg.getContentSize(); + TotalSize += Seg.getZeroFillSize(); + } + + return TotalSize; + } + }; + + class RemoteJITLinkMemoryManager : public jitlink::JITLinkMemoryManager { + public: + RemoteJITLinkMemoryManager(OrcRemoteTargetClient &Client, + ResourceIdMgr::ResourceId Id) + : Client(Client), Id(Id) {} + + RemoteJITLinkMemoryManager(const RemoteJITLinkMemoryManager &) = delete; + RemoteJITLinkMemoryManager(RemoteJITLinkMemoryManager &&) = default; + + RemoteJITLinkMemoryManager & + operator=(const RemoteJITLinkMemoryManager &) = delete; + RemoteJITLinkMemoryManager & + operator=(RemoteJITLinkMemoryManager &&) = delete; + + ~RemoteJITLinkMemoryManager() { + Client.destroyRemoteAllocator(Id); + LLVM_DEBUG(dbgs() << "Destroyed remote allocator " << Id << "\n"); + } + + Expected> + allocate(const jitlink::JITLinkDylib *JD, + const SegmentsRequestMap &Request) override { + return RPCMMAlloc::Create(Client, Id, Request); + } + + private: + OrcRemoteTargetClient &Client; + ResourceIdMgr::ResourceId Id; + }; + /// Remote indirect stubs manager. 
class RemoteIndirectStubsManager : public IndirectStubsManager { public: @@ -453,20 +669,8 @@ public: public: RemoteTrampolinePool(OrcRemoteTargetClient &Client) : Client(Client) {} - Expected getTrampoline() override { - std::lock_guard Lock(RTPMutex); - if (AvailableTrampolines.empty()) { - if (auto Err = grow()) - return std::move(Err); - } - assert(!AvailableTrampolines.empty() && "Failed to grow trampoline pool"); - auto TrampolineAddr = AvailableTrampolines.back(); - AvailableTrampolines.pop_back(); - return TrampolineAddr; - } - private: - Error grow() { + Error grow() override { JITTargetAddress BlockAddr = 0; uint32_t NumTrampolines = 0; if (auto TrampolineInfoOrErr = Client.emitTrampolineBlock()) @@ -476,14 +680,12 @@ public: uint32_t TrampolineSize = Client.getTrampolineSize(); for (unsigned I = 0; I < NumTrampolines; ++I) - this->AvailableTrampolines.push_back(BlockAddr + (I * TrampolineSize)); + AvailableTrampolines.push_back(BlockAddr + (I * TrampolineSize)); return Error::success(); } - std::mutex RTPMutex; OrcRemoteTargetClient &Client; - std::vector AvailableTrampolines; }; /// Remote compile callback manager. @@ -501,7 +703,7 @@ public: /// Channel is the ChannelT instance to communicate on. It is assumed that /// the channel is ready to be read from and written to. static Expected> - Create(rpc::RawByteChannel &Channel, ExecutionSession &ES) { + Create(shared::RawByteChannel &Channel, ExecutionSession &ES) { Error Err = Error::success(); auto Client = std::unique_ptr( new OrcRemoteTargetClient(Channel, ES, Err)); @@ -518,6 +720,14 @@ public: return callB(Addr); } + /// Call the int(int) function at the given address in the target and return + /// its result. + Expected callIntInt(JITTargetAddress Addr, int Arg) { + LLVM_DEBUG(dbgs() << "Calling int(*)(int) " << format("0x%016" PRIx64, Addr) + << "\n"); + return callB(Addr, Arg); + } + /// Call the int(int, char*[]) function at the given address in the target and /// return its result. 
Expected callMain(JITTargetAddress Addr, @@ -546,6 +756,18 @@ public: new RemoteRTDyldMemoryManager(*this, Id)); } + /// Create a JITLink-compatible memory manager which will allocate working + /// memory on the host and target memory on the remote target. + Expected> + createRemoteJITLinkMemoryManager() { + auto Id = AllocatorIds.getNext(); + if (auto Err = callB(Id)) + return std::move(Err); + LLVM_DEBUG(dbgs() << "Created remote allocator " << Id << "\n"); + return std::unique_ptr( + new RemoteJITLinkMemoryManager(*this, Id)); + } + /// Create an RCIndirectStubsManager that will allocate stubs on the remote /// target. Expected> @@ -583,9 +805,10 @@ public: Error terminateSession() { return callB(); } private: - OrcRemoteTargetClient(rpc::RawByteChannel &Channel, ExecutionSession &ES, + OrcRemoteTargetClient(shared::RawByteChannel &Channel, ExecutionSession &ES, Error &Err) - : rpc::SingleThreadedRPCEndpoint(Channel, true), + : shared::SingleThreadedRPCEndpoint(Channel, + true), ES(ES) { ErrorAsOutParameter EAO(&Err); diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h index 52a328165240..367bfb369191 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h @@ -16,8 +16,8 @@ #define LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETRPCAPI_H #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/RPC/RPCUtils.h" -#include "llvm/ExecutionEngine/Orc/RPC/RawByteChannel.h" +#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h" +#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h" namespace llvm { namespace orc { @@ -73,10 +73,9 @@ private: } // end namespace remote -namespace rpc { +namespace shared { -template <> -class RPCTypeName { +template <> class SerializationTypeName { public: static 
const char *getName() { return "JITSymbolFlags"; } }; @@ -100,7 +99,7 @@ public: } }; -template <> class RPCTypeName { +template <> class SerializationTypeName { public: static const char *getName() { return "DirectBufferWriter"; } }; @@ -133,7 +132,7 @@ public: } }; -} // end namespace rpc +} // end namespace shared namespace remote { @@ -167,20 +166,20 @@ private: namespace eh { /// Registers EH frames on the remote. - class RegisterEHFrames - : public rpc::Function { - public: - static const char *getName() { return "RegisterEHFrames"; } - }; +class RegisterEHFrames + : public shared::RPCFunction { +public: + static const char *getName() { return "RegisterEHFrames"; } +}; /// Deregisters EH frames on the remote. - class DeregisterEHFrames - : public rpc::Function { - public: - static const char *getName() { return "DeregisterEHFrames"; } - }; +class DeregisterEHFrames + : public shared::RPCFunction { +public: + static const char *getName() { return "DeregisterEHFrames"; } +}; } // end namespace eh @@ -189,28 +188,38 @@ namespace exec { /// Call an 'int32_t()'-type function on the remote, returns the called /// function's return value. - class CallIntVoid - : public rpc::Function { - public: - static const char *getName() { return "CallIntVoid"; } - }; +class CallIntVoid + : public shared::RPCFunction { +public: + static const char *getName() { return "CallIntVoid"; } +}; + + /// Call an 'int32_t(int32_t)'-type function on the remote, returns the called + /// function's return value. +class CallIntInt + : public shared::RPCFunction { +public: + static const char *getName() { return "CallIntInt"; } +}; /// Call an 'int32_t(int32_t, char**)'-type function on the remote, returns the /// called function's return value. 
- class CallMain - : public rpc::Function Args)> { - public: - static const char *getName() { return "CallMain"; } - }; +class CallMain + : public shared::RPCFunction Args)> { +public: + static const char *getName() { return "CallMain"; } +}; /// Calls a 'void()'-type function on the remote, returns when the called /// function completes. - class CallVoidVoid - : public rpc::Function { - public: - static const char *getName() { return "CallVoidVoid"; } - }; +class CallVoidVoid + : public shared::RPCFunction { +public: + static const char *getName() { return "CallVoidVoid"; } +}; } // end namespace exec @@ -218,60 +227,62 @@ namespace exec { namespace mem { /// Creates a memory allocator on the remote. - class CreateRemoteAllocator - : public rpc::Function { - public: - static const char *getName() { return "CreateRemoteAllocator"; } - }; +class CreateRemoteAllocator + : public shared::RPCFunction { +public: + static const char *getName() { return "CreateRemoteAllocator"; } +}; /// Destroys a remote allocator, freeing any memory allocated by it. - class DestroyRemoteAllocator - : public rpc::Function { - public: - static const char *getName() { return "DestroyRemoteAllocator"; } - }; +class DestroyRemoteAllocator + : public shared::RPCFunction { +public: + static const char *getName() { return "DestroyRemoteAllocator"; } +}; /// Read a remote memory block. - class ReadMem - : public rpc::Function(JITTargetAddress Src, - uint64_t Size)> { - public: - static const char *getName() { return "ReadMem"; } - }; +class ReadMem + : public shared::RPCFunction< + ReadMem, std::vector(JITTargetAddress Src, uint64_t Size)> { +public: + static const char *getName() { return "ReadMem"; } +}; /// Reserve a block of memory on the remote via the given allocator. 
- class ReserveMem - : public rpc::Function { - public: - static const char *getName() { return "ReserveMem"; } - }; +class ReserveMem + : public shared::RPCFunction< + ReserveMem, JITTargetAddress(ResourceIdMgr::ResourceId AllocID, + uint64_t Size, uint32_t Align)> { +public: + static const char *getName() { return "ReserveMem"; } +}; /// Set the memory protection on a memory block. - class SetProtections - : public rpc::Function { - public: - static const char *getName() { return "SetProtections"; } - }; +class SetProtections + : public shared::RPCFunction< + SetProtections, void(ResourceIdMgr::ResourceId AllocID, + JITTargetAddress Dst, uint32_t ProtFlags)> { +public: + static const char *getName() { return "SetProtections"; } +}; /// Write to a remote memory block. - class WriteMem - : public rpc::Function { - public: - static const char *getName() { return "WriteMem"; } - }; +class WriteMem + : public shared::RPCFunction { +public: + static const char *getName() { return "WriteMem"; } +}; /// Write to a remote pointer. - class WritePtr : public rpc::Function { - public: - static const char *getName() { return "WritePtr"; } - }; +class WritePtr + : public shared::RPCFunction { +public: + static const char *getName() { return "WritePtr"; } +}; } // end namespace mem @@ -279,45 +290,46 @@ namespace mem { namespace stubs { /// Creates an indirect stub owner on the remote. - class CreateIndirectStubsOwner - : public rpc::Function { - public: - static const char *getName() { return "CreateIndirectStubsOwner"; } - }; +class CreateIndirectStubsOwner + : public shared::RPCFunction { +public: + static const char *getName() { return "CreateIndirectStubsOwner"; } +}; /// RPC function for destroying an indirect stubs owner. 
- class DestroyIndirectStubsOwner - : public rpc::Function { - public: - static const char *getName() { return "DestroyIndirectStubsOwner"; } - }; +class DestroyIndirectStubsOwner + : public shared::RPCFunction { +public: + static const char *getName() { return "DestroyIndirectStubsOwner"; } +}; /// EmitIndirectStubs result is (StubsBase, PtrsBase, NumStubsEmitted). - class EmitIndirectStubs - : public rpc::Function< - EmitIndirectStubs, - std::tuple( - ResourceIdMgr::ResourceId StubsOwnerID, - uint32_t NumStubsRequired)> { - public: - static const char *getName() { return "EmitIndirectStubs"; } - }; +class EmitIndirectStubs + : public shared::RPCFunction< + EmitIndirectStubs, + std::tuple( + ResourceIdMgr::ResourceId StubsOwnerID, + uint32_t NumStubsRequired)> { +public: + static const char *getName() { return "EmitIndirectStubs"; } +}; /// RPC function to emit the resolver block and return its address. - class EmitResolverBlock : public rpc::Function { - public: - static const char *getName() { return "EmitResolverBlock"; } - }; +class EmitResolverBlock + : public shared::RPCFunction { +public: + static const char *getName() { return "EmitResolverBlock"; } +}; /// EmitTrampolineBlock result is (BlockAddr, NumTrampolines). - class EmitTrampolineBlock - : public rpc::Function()> { - public: - static const char *getName() { return "EmitTrampolineBlock"; } - }; +class EmitTrampolineBlock + : public shared::RPCFunction()> { +public: + static const char *getName() { return "EmitTrampolineBlock"; } +}; } // end namespace stubs @@ -326,44 +338,44 @@ namespace utils { /// GetRemoteInfo result is (Triple, PointerSize, PageSize, TrampolineSize, /// IndirectStubsSize). 
- class GetRemoteInfo - : public rpc::Function< - GetRemoteInfo, - std::tuple()> { - public: - static const char *getName() { return "GetRemoteInfo"; } - }; +class GetRemoteInfo + : public shared::RPCFunction< + GetRemoteInfo, + std::tuple()> { +public: + static const char *getName() { return "GetRemoteInfo"; } +}; /// Get the address of a remote symbol. - class GetSymbolAddress - : public rpc::Function { - public: - static const char *getName() { return "GetSymbolAddress"; } - }; +class GetSymbolAddress + : public shared::RPCFunction { +public: + static const char *getName() { return "GetSymbolAddress"; } +}; /// Request that the host execute a compile callback. - class RequestCompile - : public rpc::Function< - RequestCompile, JITTargetAddress(JITTargetAddress TrampolineAddr)> { - public: - static const char *getName() { return "RequestCompile"; } - }; +class RequestCompile + : public shared::RPCFunction< + RequestCompile, JITTargetAddress(JITTargetAddress TrampolineAddr)> { +public: + static const char *getName() { return "RequestCompile"; } +}; /// Notify the remote and terminate the session. - class TerminateSession : public rpc::Function { - public: - static const char *getName() { return "TerminateSession"; } - }; +class TerminateSession : public shared::RPCFunction { +public: + static const char *getName() { return "TerminateSession"; } +}; } // namespace utils class OrcRemoteTargetRPCAPI - : public rpc::SingleThreadedRPCEndpoint { + : public shared::SingleThreadedRPCEndpoint { public: // FIXME: Remove constructors once MSVC supports synthesizing move-ops. 
- OrcRemoteTargetRPCAPI(rpc::RawByteChannel &C) - : rpc::SingleThreadedRPCEndpoint(C, true) {} + OrcRemoteTargetRPCAPI(shared::RawByteChannel &C) + : shared::SingleThreadedRPCEndpoint(C, true) {} }; } // end namespace remote diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h index 50c155d77db1..ce9bf064303d 100644 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h +++ b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h @@ -16,8 +16,8 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" -#include "llvm/ExecutionEngine/Orc/OrcError.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h" +#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/Format.h" @@ -46,7 +46,7 @@ namespace remote { template class OrcRemoteTargetServer - : public rpc::SingleThreadedRPCEndpoint { + : public shared::SingleThreadedRPCEndpoint { public: using SymbolLookupFtor = std::function; @@ -57,12 +57,14 @@ public: OrcRemoteTargetServer(ChannelT &Channel, SymbolLookupFtor SymbolLookup, EHFrameRegistrationFtor EHFramesRegister, EHFrameRegistrationFtor EHFramesDeregister) - : rpc::SingleThreadedRPCEndpoint(Channel, true), + : shared::SingleThreadedRPCEndpoint(Channel, + true), SymbolLookup(std::move(SymbolLookup)), EHFramesRegister(std::move(EHFramesRegister)), EHFramesDeregister(std::move(EHFramesDeregister)) { using ThisT = std::remove_reference_t; addHandler(*this, &ThisT::handleCallIntVoid); + addHandler(*this, &ThisT::handleCallIntInt); addHandler(*this, &ThisT::handleCallMain); addHandler(*this, &ThisT::handleCallVoidVoid); addHandler(*this, @@ -168,6 +170,19 @@ private: return Result; } + Expected handleCallIntInt(JITTargetAddress Addr, int Arg) { + 
using IntIntFnTy = int (*)(int); + + IntIntFnTy Fn = reinterpret_cast(static_cast(Addr)); + + LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) + << " with argument " << Arg << "\n"); + int Result = Fn(Arg); + LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); + + return Result; + } + Expected handleCallMain(JITTargetAddress Addr, std::vector Args) { using MainFnTy = int (*)(int, const char *[]); diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h deleted file mode 100644 index 2f37ab40c7f8..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h +++ /dev/null @@ -1,702 +0,0 @@ -//===- llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h --------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_RPCSERIALIZATION_H -#define LLVM_EXECUTIONENGINE_ORC_RPCSERIALIZATION_H - -#include "llvm/ExecutionEngine/Orc/OrcError.h" -#include "llvm/Support/thread.h" -#include -#include -#include -#include -#include -#include - -namespace llvm { -namespace orc { -namespace rpc { - -template -class RPCTypeName; - -/// TypeNameSequence is a utility for rendering sequences of types to a string -/// by rendering each type, separated by ", ". -template class RPCTypeNameSequence {}; - -/// Render an empty TypeNameSequence to an ostream. -template -OStream &operator<<(OStream &OS, const RPCTypeNameSequence<> &V) { - return OS; -} - -/// Render a TypeNameSequence of a single type to an ostream. 
-template -OStream &operator<<(OStream &OS, const RPCTypeNameSequence &V) { - OS << RPCTypeName::getName(); - return OS; -} - -/// Render a TypeNameSequence of more than one type to an ostream. -template -OStream& -operator<<(OStream &OS, const RPCTypeNameSequence &V) { - OS << RPCTypeName::getName() << ", " - << RPCTypeNameSequence(); - return OS; -} - -template <> -class RPCTypeName { -public: - static const char* getName() { return "void"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "int8_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "uint8_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "int16_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "uint16_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "int32_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "uint32_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "int64_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "uint64_t"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "bool"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "std::string"; } -}; - -template <> -class RPCTypeName { -public: - static const char* getName() { return "Error"; } -}; - -template -class RPCTypeName> { -public: - static const char* getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) << "Expected<" - << RPCTypeNameSequence() - << ">"; - return Name; - }(); - return Name.data(); - } -}; - -template -class RPCTypeName> { -public: - static const char* getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) << 
"std::pair<" << RPCTypeNameSequence() - << ">"; - return Name; - }(); - return Name.data(); - } -}; - -template -class RPCTypeName> { -public: - static const char* getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) << "std::tuple<" - << RPCTypeNameSequence() << ">"; - return Name; - }(); - return Name.data(); - } -}; - -template -class RPCTypeName> { -public: - static const char*getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) << "std::vector<" << RPCTypeName::getName() - << ">"; - return Name; - }(); - return Name.data(); - } -}; - -template class RPCTypeName> { -public: - static const char *getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) - << "std::set<" << RPCTypeName::getName() << ">"; - return Name; - }(); - return Name.data(); - } -}; - -template class RPCTypeName> { -public: - static const char *getName() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) - << "std::map<" << RPCTypeNameSequence() << ">"; - return Name; - }(); - return Name.data(); - } -}; - -/// The SerializationTraits class describes how to serialize and -/// deserialize an instance of type T to/from an abstract channel of type -/// ChannelT. It also provides a representation of the type's name via the -/// getName method. -/// -/// Specializations of this class should provide the following functions: -/// -/// @code{.cpp} -/// -/// static const char* getName(); -/// static Error serialize(ChannelT&, const T&); -/// static Error deserialize(ChannelT&, T&); -/// -/// @endcode -/// -/// The third argument of SerializationTraits is intended to support SFINAE. -/// E.g.: -/// -/// @code{.cpp} -/// -/// class MyVirtualChannel { ... }; -/// -/// template -/// class SerializationTraits::value -/// >> { -/// public: -/// static const char* getName() { ... 
}; -/// } -/// -/// @endcode -template -class SerializationTraits; - -template -class SequenceTraits { -public: - static Error emitSeparator(ChannelT &C) { return Error::success(); } - static Error consumeSeparator(ChannelT &C) { return Error::success(); } -}; - -/// Utility class for serializing sequences of values of varying types. -/// Specializations of this class contain 'serialize' and 'deserialize' methods -/// for the given channel. The ArgTs... list will determine the "over-the-wire" -/// types to be serialized. The serialize and deserialize methods take a list -/// CArgTs... ("caller arg types") which must be the same length as ArgTs..., -/// but may be different types from ArgTs, provided that for each CArgT there -/// is a SerializationTraits specialization -/// SerializeTraits with methods that can serialize the -/// caller argument to over-the-wire value. -template -class SequenceSerialization; - -template -class SequenceSerialization { -public: - static Error serialize(ChannelT &C) { return Error::success(); } - static Error deserialize(ChannelT &C) { return Error::success(); } -}; - -template -class SequenceSerialization { -public: - - template - static Error serialize(ChannelT &C, CArgT &&CArg) { - return SerializationTraits>::serialize( - C, std::forward(CArg)); - } - - template - static Error deserialize(ChannelT &C, CArgT &CArg) { - return SerializationTraits::deserialize(C, CArg); - } -}; - -template -class SequenceSerialization { -public: - - template - static Error serialize(ChannelT &C, CArgT &&CArg, - CArgTs &&... CArgs) { - if (auto Err = - SerializationTraits>::serialize( - C, std::forward(CArg))) - return Err; - if (auto Err = SequenceTraits::emitSeparator(C)) - return Err; - return SequenceSerialization:: - serialize(C, std::forward(CArgs)...); - } - - template - static Error deserialize(ChannelT &C, CArgT &CArg, - CArgTs &... 
CArgs) { - if (auto Err = - SerializationTraits::deserialize(C, CArg)) - return Err; - if (auto Err = SequenceTraits::consumeSeparator(C)) - return Err; - return SequenceSerialization::deserialize(C, CArgs...); - } -}; - -template -Error serializeSeq(ChannelT &C, ArgTs &&... Args) { - return SequenceSerialization...>::serialize( - C, std::forward(Args)...); -} - -template -Error deserializeSeq(ChannelT &C, ArgTs &... Args) { - return SequenceSerialization::deserialize(C, Args...); -} - -template -class SerializationTraits { -public: - - using WrappedErrorSerializer = - std::function; - - using WrappedErrorDeserializer = - std::function; - - template - static void registerErrorType(std::string Name, SerializeFtor Serialize, - DeserializeFtor Deserialize) { - assert(!Name.empty() && - "The empty string is reserved for the Success value"); - - const std::string *KeyName = nullptr; - { - // We're abusing the stability of std::map here: We take a reference to the - // key of the deserializers map to save us from duplicating the string in - // the serializer. This should be changed to use a stringpool if we switch - // to a map type that may move keys in memory. 
- std::lock_guard Lock(DeserializersMutex); - auto I = - Deserializers.insert(Deserializers.begin(), - std::make_pair(std::move(Name), - std::move(Deserialize))); - KeyName = &I->first; - } - - { - assert(KeyName != nullptr && "No keyname pointer"); - std::lock_guard Lock(SerializersMutex); - Serializers[ErrorInfoT::classID()] = - [KeyName, Serialize = std::move(Serialize)]( - ChannelT &C, const ErrorInfoBase &EIB) -> Error { - assert(EIB.dynamicClassID() == ErrorInfoT::classID() && - "Serializer called for wrong error type"); - if (auto Err = serializeSeq(C, *KeyName)) - return Err; - return Serialize(C, static_cast(EIB)); - }; - } - } - - static Error serialize(ChannelT &C, Error &&Err) { - std::lock_guard Lock(SerializersMutex); - - if (!Err) - return serializeSeq(C, std::string()); - - return handleErrors(std::move(Err), - [&C](const ErrorInfoBase &EIB) { - auto SI = Serializers.find(EIB.dynamicClassID()); - if (SI == Serializers.end()) - return serializeAsStringError(C, EIB); - return (SI->second)(C, EIB); - }); - } - - static Error deserialize(ChannelT &C, Error &Err) { - std::lock_guard Lock(DeserializersMutex); - - std::string Key; - if (auto Err = deserializeSeq(C, Key)) - return Err; - - if (Key.empty()) { - ErrorAsOutParameter EAO(&Err); - Err = Error::success(); - return Error::success(); - } - - auto DI = Deserializers.find(Key); - assert(DI != Deserializers.end() && "No deserializer for error type"); - return (DI->second)(C, Err); - } - -private: - - static Error serializeAsStringError(ChannelT &C, const ErrorInfoBase &EIB) { - std::string ErrMsg; - { - raw_string_ostream ErrMsgStream(ErrMsg); - EIB.log(ErrMsgStream); - } - return serialize(C, make_error(std::move(ErrMsg), - inconvertibleErrorCode())); - } - - static std::recursive_mutex SerializersMutex; - static std::recursive_mutex DeserializersMutex; - static std::map Serializers; - static std::map Deserializers; -}; - -template -std::recursive_mutex SerializationTraits::SerializersMutex; - 
-template -std::recursive_mutex SerializationTraits::DeserializersMutex; - -template -std::map::WrappedErrorSerializer> -SerializationTraits::Serializers; - -template -std::map::WrappedErrorDeserializer> -SerializationTraits::Deserializers; - -/// Registers a serializer and deserializer for the given error type on the -/// given channel type. -template -void registerErrorSerialization(std::string Name, SerializeFtor &&Serialize, - DeserializeFtor &&Deserialize) { - SerializationTraits::template registerErrorType( - std::move(Name), - std::forward(Serialize), - std::forward(Deserialize)); -} - -/// Registers serialization/deserialization for StringError. -template -void registerStringError() { - static bool AlreadyRegistered = false; - if (!AlreadyRegistered) { - registerErrorSerialization( - "StringError", - [](ChannelT &C, const StringError &SE) { - return serializeSeq(C, SE.getMessage()); - }, - [](ChannelT &C, Error &Err) -> Error { - ErrorAsOutParameter EAO(&Err); - std::string Msg; - if (auto E2 = deserializeSeq(C, Msg)) - return E2; - Err = - make_error(std::move(Msg), - orcError( - OrcErrorCode::UnknownErrorCodeFromRemote)); - return Error::success(); - }); - AlreadyRegistered = true; - } -} - -/// SerializationTraits for Expected from an Expected. 
-template -class SerializationTraits, Expected> { -public: - - static Error serialize(ChannelT &C, Expected &&ValOrErr) { - if (ValOrErr) { - if (auto Err = serializeSeq(C, true)) - return Err; - return SerializationTraits::serialize(C, *ValOrErr); - } - if (auto Err = serializeSeq(C, false)) - return Err; - return serializeSeq(C, ValOrErr.takeError()); - } - - static Error deserialize(ChannelT &C, Expected &ValOrErr) { - ExpectedAsOutParameter EAO(&ValOrErr); - bool HasValue; - if (auto Err = deserializeSeq(C, HasValue)) - return Err; - if (HasValue) - return SerializationTraits::deserialize(C, *ValOrErr); - Error Err = Error::success(); - if (auto E2 = deserializeSeq(C, Err)) - return E2; - ValOrErr = std::move(Err); - return Error::success(); - } -}; - -/// SerializationTraits for Expected from a T2. -template -class SerializationTraits, T2> { -public: - - static Error serialize(ChannelT &C, T2 &&Val) { - return serializeSeq(C, Expected(std::forward(Val))); - } -}; - -/// SerializationTraits for Expected from an Error. -template -class SerializationTraits, Error> { -public: - - static Error serialize(ChannelT &C, Error &&Err) { - return serializeSeq(C, Expected(std::move(Err))); - } -}; - -/// SerializationTraits default specialization for std::pair. -template -class SerializationTraits, std::pair> { -public: - static Error serialize(ChannelT &C, const std::pair &V) { - if (auto Err = SerializationTraits::serialize(C, V.first)) - return Err; - return SerializationTraits::serialize(C, V.second); - } - - static Error deserialize(ChannelT &C, std::pair &V) { - if (auto Err = - SerializationTraits::deserialize(C, V.first)) - return Err; - return SerializationTraits::deserialize(C, V.second); - } -}; - -/// SerializationTraits default specialization for std::tuple. -template -class SerializationTraits> { -public: - - /// RPC channel serialization for std::tuple. 
- static Error serialize(ChannelT &C, const std::tuple &V) { - return serializeTupleHelper(C, V, std::index_sequence_for()); - } - - /// RPC channel deserialization for std::tuple. - static Error deserialize(ChannelT &C, std::tuple &V) { - return deserializeTupleHelper(C, V, std::index_sequence_for()); - } - -private: - // Serialization helper for std::tuple. - template - static Error serializeTupleHelper(ChannelT &C, const std::tuple &V, - std::index_sequence _) { - return serializeSeq(C, std::get(V)...); - } - - // Serialization helper for std::tuple. - template - static Error deserializeTupleHelper(ChannelT &C, std::tuple &V, - std::index_sequence _) { - return deserializeSeq(C, std::get(V)...); - } -}; - -/// SerializationTraits default specialization for std::vector. -template -class SerializationTraits> { -public: - - /// Serialize a std::vector from std::vector. - static Error serialize(ChannelT &C, const std::vector &V) { - if (auto Err = serializeSeq(C, static_cast(V.size()))) - return Err; - - for (const auto &E : V) - if (auto Err = serializeSeq(C, E)) - return Err; - - return Error::success(); - } - - /// Deserialize a std::vector to a std::vector. - static Error deserialize(ChannelT &C, std::vector &V) { - assert(V.empty() && - "Expected default-constructed vector to deserialize into"); - - uint64_t Count = 0; - if (auto Err = deserializeSeq(C, Count)) - return Err; - - V.resize(Count); - for (auto &E : V) - if (auto Err = deserializeSeq(C, E)) - return Err; - - return Error::success(); - } -}; - -template -class SerializationTraits, std::set> { -public: - /// Serialize a std::set from std::set. - static Error serialize(ChannelT &C, const std::set &S) { - if (auto Err = serializeSeq(C, static_cast(S.size()))) - return Err; - - for (const auto &E : S) - if (auto Err = SerializationTraits::serialize(C, E)) - return Err; - - return Error::success(); - } - - /// Deserialize a std::set to a std::set. 
- static Error deserialize(ChannelT &C, std::set &S) { - assert(S.empty() && "Expected default-constructed set to deserialize into"); - - uint64_t Count = 0; - if (auto Err = deserializeSeq(C, Count)) - return Err; - - while (Count-- != 0) { - T2 Val; - if (auto Err = SerializationTraits::deserialize(C, Val)) - return Err; - - auto Added = S.insert(Val).second; - if (!Added) - return make_error("Duplicate element in deserialized set", - orcError(OrcErrorCode::UnknownORCError)); - } - - return Error::success(); - } -}; - -template -class SerializationTraits, std::map> { -public: - /// Serialize a std::map from std::map. - static Error serialize(ChannelT &C, const std::map &M) { - if (auto Err = serializeSeq(C, static_cast(M.size()))) - return Err; - - for (const auto &E : M) { - if (auto Err = - SerializationTraits::serialize(C, E.first)) - return Err; - if (auto Err = - SerializationTraits::serialize(C, E.second)) - return Err; - } - - return Error::success(); - } - - /// Deserialize a std::map to a std::map. 
- static Error deserialize(ChannelT &C, std::map &M) { - assert(M.empty() && "Expected default-constructed map to deserialize into"); - - uint64_t Count = 0; - if (auto Err = deserializeSeq(C, Count)) - return Err; - - while (Count-- != 0) { - std::pair Val; - if (auto Err = - SerializationTraits::deserialize(C, Val.first)) - return Err; - - if (auto Err = - SerializationTraits::deserialize(C, Val.second)) - return Err; - - auto Added = M.insert(Val).second; - if (!Added) - return make_error("Duplicate element in deserialized map", - orcError(OrcErrorCode::UnknownORCError)); - } - - return Error::success(); - } -}; - -} // end namespace rpc -} // end namespace orc -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_RPCSERIALIZATION_H diff --git a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCUtils.h b/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCUtils.h deleted file mode 100644 index f348844f39ce..000000000000 --- a/contrib/llvm-project/llvm/include/llvm/ExecutionEngine/Orc/RPC/RPCUtils.h +++ /dev/null @@ -1,1687 +0,0 @@ -//===- RPCUtils.h - Utilities for building RPC APIs -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Utilities to support construction of simple RPC APIs. -// -// The RPC utilities aim for ease of use (minimal conceptual overhead) for C++ -// programmers, high performance, low memory overhead, and efficient use of the -// communications channel. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_RPCUTILS_H -#define LLVM_EXECUTIONENGINE_ORC_RPCUTILS_H - -#include -#include -#include - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/Orc/OrcError.h" -#include "llvm/ExecutionEngine/Orc/RPC/RPCSerialization.h" -#include "llvm/Support/MSVCErrorWorkarounds.h" - -#include - -namespace llvm { -namespace orc { -namespace rpc { - -/// Base class of all fatal RPC errors (those that necessarily result in the -/// termination of the RPC session). -class RPCFatalError : public ErrorInfo { -public: - static char ID; -}; - -/// RPCConnectionClosed is returned from RPC operations if the RPC connection -/// has already been closed due to either an error or graceful disconnection. -class ConnectionClosed : public ErrorInfo { -public: - static char ID; - std::error_code convertToErrorCode() const override; - void log(raw_ostream &OS) const override; -}; - -/// BadFunctionCall is returned from handleOne when the remote makes a call with -/// an unrecognized function id. -/// -/// This error is fatal because Orc RPC needs to know how to parse a function -/// call to know where the next call starts, and if it doesn't recognize the -/// function id it cannot parse the call. 
-template -class BadFunctionCall - : public ErrorInfo, RPCFatalError> { -public: - static char ID; - - BadFunctionCall(FnIdT FnId, SeqNoT SeqNo) - : FnId(std::move(FnId)), SeqNo(std::move(SeqNo)) {} - - std::error_code convertToErrorCode() const override { - return orcError(OrcErrorCode::UnexpectedRPCCall); - } - - void log(raw_ostream &OS) const override { - OS << "Call to invalid RPC function id '" << FnId << "' with " - "sequence number " << SeqNo; - } - -private: - FnIdT FnId; - SeqNoT SeqNo; -}; - -template -char BadFunctionCall::ID = 0; - -/// InvalidSequenceNumberForResponse is returned from handleOne when a response -/// call arrives with a sequence number that doesn't correspond to any in-flight -/// function call. -/// -/// This error is fatal because Orc RPC needs to know how to parse the rest of -/// the response call to know where the next call starts, and if it doesn't have -/// a result parser for this sequence number it can't do that. -template -class InvalidSequenceNumberForResponse - : public ErrorInfo, RPCFatalError> { -public: - static char ID; - - InvalidSequenceNumberForResponse(SeqNoT SeqNo) - : SeqNo(std::move(SeqNo)) {} - - std::error_code convertToErrorCode() const override { - return orcError(OrcErrorCode::UnexpectedRPCCall); - }; - - void log(raw_ostream &OS) const override { - OS << "Response has unknown sequence number " << SeqNo; - } -private: - SeqNoT SeqNo; -}; - -template -char InvalidSequenceNumberForResponse::ID = 0; - -/// This non-fatal error will be passed to asynchronous result handlers in place -/// of a result if the connection goes down before a result returns, or if the -/// function to be called cannot be negotiated with the remote. -class ResponseAbandoned : public ErrorInfo { -public: - static char ID; - - std::error_code convertToErrorCode() const override; - void log(raw_ostream &OS) const override; -}; - -/// This error is returned if the remote does not have a handler installed for -/// the given RPC function. 
-class CouldNotNegotiate : public ErrorInfo { -public: - static char ID; - - CouldNotNegotiate(std::string Signature); - std::error_code convertToErrorCode() const override; - void log(raw_ostream &OS) const override; - const std::string &getSignature() const { return Signature; } -private: - std::string Signature; -}; - -template class Function; - -// RPC Function class. -// DerivedFunc should be a user defined class with a static 'getName()' method -// returning a const char* representing the function's name. -template -class Function { -public: - /// User defined function type. - using Type = RetT(ArgTs...); - - /// Return type. - using ReturnType = RetT; - - /// Returns the full function prototype as a string. - static const char *getPrototype() { - static std::string Name = [] { - std::string Name; - raw_string_ostream(Name) - << RPCTypeName::getName() << " " << DerivedFunc::getName() - << "(" << llvm::orc::rpc::RPCTypeNameSequence() << ")"; - return Name; - }(); - return Name.data(); - } -}; - -/// Allocates RPC function ids during autonegotiation. -/// Specializations of this class must provide four members: -/// -/// static T getInvalidId(): -/// Should return a reserved id that will be used to represent missing -/// functions during autonegotiation. -/// -/// static T getResponseId(): -/// Should return a reserved id that will be used to send function responses -/// (return values). -/// -/// static T getNegotiateId(): -/// Should return a reserved id for the negotiate function, which will be used -/// to negotiate ids for user defined functions. -/// -/// template T allocate(): -/// Allocate a unique id for function Func. -template class RPCFunctionIdAllocator; - -/// This specialization of RPCFunctionIdAllocator provides a default -/// implementation for integral types. 
-template -class RPCFunctionIdAllocator::value>> { -public: - static T getInvalidId() { return T(0); } - static T getResponseId() { return T(1); } - static T getNegotiateId() { return T(2); } - - template T allocate() { return NextId++; } - -private: - T NextId = 3; -}; - -namespace detail { - -/// Provides a typedef for a tuple containing the decayed argument types. -template class FunctionArgsTuple; - -template -class FunctionArgsTuple { -public: - using Type = std::tuple>...>; -}; - -// ResultTraits provides typedefs and utilities specific to the return type -// of functions. -template class ResultTraits { -public: - // The return type wrapped in llvm::Expected. - using ErrorReturnType = Expected; - -#ifdef _MSC_VER - // The ErrorReturnType wrapped in a std::promise. - using ReturnPromiseType = std::promise>; - - // The ErrorReturnType wrapped in a std::future. - using ReturnFutureType = std::future>; -#else - // The ErrorReturnType wrapped in a std::promise. - using ReturnPromiseType = std::promise; - - // The ErrorReturnType wrapped in a std::future. - using ReturnFutureType = std::future; -#endif - - // Create a 'blank' value of the ErrorReturnType, ready and safe to - // overwrite. - static ErrorReturnType createBlankErrorReturnValue() { - return ErrorReturnType(RetT()); - } - - // Consume an abandoned ErrorReturnType. - static void consumeAbandoned(ErrorReturnType RetOrErr) { - consumeError(RetOrErr.takeError()); - } -}; - -// ResultTraits specialization for void functions. -template <> class ResultTraits { -public: - // For void functions, ErrorReturnType is llvm::Error. - using ErrorReturnType = Error; - -#ifdef _MSC_VER - // The ErrorReturnType wrapped in a std::promise. - using ReturnPromiseType = std::promise; - - // The ErrorReturnType wrapped in a std::future. - using ReturnFutureType = std::future; -#else - // The ErrorReturnType wrapped in a std::promise. - using ReturnPromiseType = std::promise; - - // The ErrorReturnType wrapped in a std::future. 
- using ReturnFutureType = std::future; -#endif - - // Create a 'blank' value of the ErrorReturnType, ready and safe to - // overwrite. - static ErrorReturnType createBlankErrorReturnValue() { - return ErrorReturnType::success(); - } - - // Consume an abandoned ErrorReturnType. - static void consumeAbandoned(ErrorReturnType Err) { - consumeError(std::move(Err)); - } -}; - -// ResultTraits is equivalent to ResultTraits. This allows -// handlers for void RPC functions to return either void (in which case they -// implicitly succeed) or Error (in which case their error return is -// propagated). See usage in HandlerTraits::runHandlerHelper. -template <> class ResultTraits : public ResultTraits {}; - -// ResultTraits> is equivalent to ResultTraits. This allows -// handlers for RPC functions returning a T to return either a T (in which -// case they implicitly succeed) or Expected (in which case their error -// return is propagated). See usage in HandlerTraits::runHandlerHelper. -template -class ResultTraits> : public ResultTraits {}; - -// Determines whether an RPC function's defined error return type supports -// error return value. -template -class SupportsErrorReturn { -public: - static const bool value = false; -}; - -template <> -class SupportsErrorReturn { -public: - static const bool value = true; -}; - -template -class SupportsErrorReturn> { -public: - static const bool value = true; -}; - -// RespondHelper packages return values based on whether or not the declared -// RPC function return type supports error returns. -template -class RespondHelper; - -// RespondHelper specialization for functions that support error returns. -template <> -class RespondHelper { -public: - - // Send Expected. - template - static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId, - SequenceNumberT SeqNo, - Expected ResultOrErr) { - if (!ResultOrErr && ResultOrErr.template errorIsA()) - return ResultOrErr.takeError(); - - // Open the response message. 
- if (auto Err = C.startSendMessage(ResponseId, SeqNo)) - return Err; - - // Serialize the result. - if (auto Err = - SerializationTraits>::serialize( - C, std::move(ResultOrErr))) - return Err; - - // Close the response message. - if (auto Err = C.endSendMessage()) - return Err; - return C.send(); - } - - template - static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId, - SequenceNumberT SeqNo, Error Err) { - if (Err && Err.isA()) - return Err; - if (auto Err2 = C.startSendMessage(ResponseId, SeqNo)) - return Err2; - if (auto Err2 = serializeSeq(C, std::move(Err))) - return Err2; - if (auto Err2 = C.endSendMessage()) - return Err2; - return C.send(); - } - -}; - -// RespondHelper specialization for functions that do not support error returns. -template <> -class RespondHelper { -public: - - template - static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId, - SequenceNumberT SeqNo, - Expected ResultOrErr) { - if (auto Err = ResultOrErr.takeError()) - return Err; - - // Open the response message. - if (auto Err = C.startSendMessage(ResponseId, SeqNo)) - return Err; - - // Serialize the result. - if (auto Err = - SerializationTraits::serialize( - C, *ResultOrErr)) - return Err; - - // End the response message. - if (auto Err = C.endSendMessage()) - return Err; - - return C.send(); - } - - template - static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId, - SequenceNumberT SeqNo, Error Err) { - if (Err) - return Err; - if (auto Err2 = C.startSendMessage(ResponseId, SeqNo)) - return Err2; - if (auto Err2 = C.endSendMessage()) - return Err2; - return C.send(); - } - -}; - - -// Send a response of the given wire return type (WireRetT) over the -// channel, with the given sequence number. 
-template -Error respond(ChannelT &C, const FunctionIdT &ResponseId, - SequenceNumberT SeqNo, Expected ResultOrErr) { - return RespondHelper::value>:: - template sendResult(C, ResponseId, SeqNo, std::move(ResultOrErr)); -} - -// Send an empty response message on the given channel to indicate that -// the handler ran. -template -Error respond(ChannelT &C, const FunctionIdT &ResponseId, SequenceNumberT SeqNo, - Error Err) { - return RespondHelper::value>:: - sendResult(C, ResponseId, SeqNo, std::move(Err)); -} - -// Converts a given type to the equivalent error return type. -template class WrappedHandlerReturn { -public: - using Type = Expected; -}; - -template class WrappedHandlerReturn> { -public: - using Type = Expected; -}; - -template <> class WrappedHandlerReturn { -public: - using Type = Error; -}; - -template <> class WrappedHandlerReturn { -public: - using Type = Error; -}; - -template <> class WrappedHandlerReturn { -public: - using Type = Error; -}; - -// Traits class that strips the response function from the list of handler -// arguments. -template class AsyncHandlerTraits; - -template -class AsyncHandlerTraits)>, ArgTs...)> { -public: - using Type = Error(ArgTs...); - using ResultType = Expected; -}; - -template -class AsyncHandlerTraits, ArgTs...)> { -public: - using Type = Error(ArgTs...); - using ResultType = Error; -}; - -template -class AsyncHandlerTraits, ArgTs...)> { -public: - using Type = Error(ArgTs...); - using ResultType = Error; -}; - -template -class AsyncHandlerTraits, ArgTs...)> { -public: - using Type = Error(ArgTs...); - using ResultType = Error; -}; - -template -class AsyncHandlerTraits - : public AsyncHandlerTraits, - ArgTs...)> {}; - -// This template class provides utilities related to RPC function handlers. -// The base case applies to non-function types (the template class is -// specialized for function types) and inherits from the appropriate -// speciilization for the given non-function type's call operator. 
-template -class HandlerTraits : public HandlerTraits::type::operator())> { -}; - -// Traits for handlers with a given function type. -template -class HandlerTraits { -public: - // Function type of the handler. - using Type = RetT(ArgTs...); - - // Return type of the handler. - using ReturnType = RetT; - - // Call the given handler with the given arguments. - template - static typename WrappedHandlerReturn::Type - unpackAndRun(HandlerT &Handler, std::tuple &Args) { - return unpackAndRunHelper(Handler, Args, - std::index_sequence_for()); - } - - // Call the given handler with the given arguments. - template - static Error unpackAndRunAsync(HandlerT &Handler, ResponderT &Responder, - std::tuple &Args) { - return unpackAndRunAsyncHelper(Handler, Responder, Args, - std::index_sequence_for()); - } - - // Call the given handler with the given arguments. - template - static std::enable_if_t< - std::is_void::ReturnType>::value, Error> - run(HandlerT &Handler, ArgTs &&... Args) { - Handler(std::move(Args)...); - return Error::success(); - } - - template - static std::enable_if_t< - !std::is_void::ReturnType>::value, - typename HandlerTraits::ReturnType> - run(HandlerT &Handler, TArgTs... Args) { - return Handler(std::move(Args)...); - } - - // Serialize arguments to the channel. - template - static Error serializeArgs(ChannelT &C, const CArgTs... CArgs) { - return SequenceSerialization::serialize(C, CArgs...); - } - - // Deserialize arguments from the channel. 
- template - static Error deserializeArgs(ChannelT &C, std::tuple &Args) { - return deserializeArgsHelper(C, Args, std::index_sequence_for()); - } - -private: - template - static Error deserializeArgsHelper(ChannelT &C, std::tuple &Args, - std::index_sequence _) { - return SequenceSerialization::deserialize( - C, std::get(Args)...); - } - - template - static typename WrappedHandlerReturn< - typename HandlerTraits::ReturnType>::Type - unpackAndRunHelper(HandlerT &Handler, ArgTuple &Args, - std::index_sequence) { - return run(Handler, std::move(std::get(Args))...); - } - - template - static typename WrappedHandlerReturn< - typename HandlerTraits::ReturnType>::Type - unpackAndRunAsyncHelper(HandlerT &Handler, ResponderT &Responder, - ArgTuple &Args, std::index_sequence) { - return run(Handler, Responder, std::move(std::get(Args))...); - } -}; - -// Handler traits for free functions. -template -class HandlerTraits - : public HandlerTraits {}; - -// Handler traits for class methods (especially call operators for lambdas). -template -class HandlerTraits - : public HandlerTraits {}; - -// Handler traits for const class methods (especially call operators for -// lambdas). -template -class HandlerTraits - : public HandlerTraits {}; - -// Utility to peel the Expected wrapper off a response handler error type. -template class ResponseHandlerArg; - -template class ResponseHandlerArg)> { -public: - using ArgType = Expected; - using UnwrappedArgType = ArgT; -}; - -template -class ResponseHandlerArg)> { -public: - using ArgType = Expected; - using UnwrappedArgType = ArgT; -}; - -template <> class ResponseHandlerArg { -public: - using ArgType = Error; -}; - -template <> class ResponseHandlerArg { -public: - using ArgType = Error; -}; - -// ResponseHandler represents a handler for a not-yet-received function call -// result. -template class ResponseHandler { -public: - virtual ~ResponseHandler() {} - - // Reads the function result off the wire and acts on it. 
The meaning of - // "act" will depend on how this method is implemented in any given - // ResponseHandler subclass but could, for example, mean running a - // user-specified handler or setting a promise value. - virtual Error handleResponse(ChannelT &C) = 0; - - // Abandons this outstanding result. - virtual void abandon() = 0; - - // Create an error instance representing an abandoned response. - static Error createAbandonedResponseError() { - return make_error(); - } -}; - -// ResponseHandler subclass for RPC functions with non-void returns. -template -class ResponseHandlerImpl : public ResponseHandler { -public: - ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {} - - // Handle the result by deserializing it from the channel then passing it - // to the user defined handler. - Error handleResponse(ChannelT &C) override { - using UnwrappedArgType = typename ResponseHandlerArg< - typename HandlerTraits::Type>::UnwrappedArgType; - UnwrappedArgType Result; - if (auto Err = - SerializationTraits::deserialize(C, Result)) - return Err; - if (auto Err = C.endReceiveMessage()) - return Err; - return Handler(std::move(Result)); - } - - // Abandon this response by calling the handler with an 'abandoned response' - // error. - void abandon() override { - if (auto Err = Handler(this->createAbandonedResponseError())) { - // Handlers should not fail when passed an abandoned response error. - report_fatal_error(std::move(Err)); - } - } - -private: - HandlerT Handler; -}; - -// ResponseHandler subclass for RPC functions with void returns. -template -class ResponseHandlerImpl - : public ResponseHandler { -public: - ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {} - - // Handle the result (no actual value, just a notification that the function - // has completed on the remote end) by calling the user-defined handler with - // Error::success(). 
- Error handleResponse(ChannelT &C) override { - if (auto Err = C.endReceiveMessage()) - return Err; - return Handler(Error::success()); - } - - // Abandon this response by calling the handler with an 'abandoned response' - // error. - void abandon() override { - if (auto Err = Handler(this->createAbandonedResponseError())) { - // Handlers should not fail when passed an abandoned response error. - report_fatal_error(std::move(Err)); - } - } - -private: - HandlerT Handler; -}; - -template -class ResponseHandlerImpl, HandlerT> - : public ResponseHandler { -public: - ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {} - - // Handle the result by deserializing it from the channel then passing it - // to the user defined handler. - Error handleResponse(ChannelT &C) override { - using HandlerArgType = typename ResponseHandlerArg< - typename HandlerTraits::Type>::ArgType; - HandlerArgType Result((typename HandlerArgType::value_type())); - - if (auto Err = - SerializationTraits, - HandlerArgType>::deserialize(C, Result)) - return Err; - if (auto Err = C.endReceiveMessage()) - return Err; - return Handler(std::move(Result)); - } - - // Abandon this response by calling the handler with an 'abandoned response' - // error. - void abandon() override { - if (auto Err = Handler(this->createAbandonedResponseError())) { - // Handlers should not fail when passed an abandoned response error. - report_fatal_error(std::move(Err)); - } - } - -private: - HandlerT Handler; -}; - -template -class ResponseHandlerImpl - : public ResponseHandler { -public: - ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {} - - // Handle the result by deserializing it from the channel then passing it - // to the user defined handler. 
- Error handleResponse(ChannelT &C) override { - Error Result = Error::success(); - if (auto Err = SerializationTraits::deserialize( - C, Result)) { - consumeError(std::move(Result)); - return Err; - } - if (auto Err = C.endReceiveMessage()) { - consumeError(std::move(Result)); - return Err; - } - return Handler(std::move(Result)); - } - - // Abandon this response by calling the handler with an 'abandoned response' - // error. - void abandon() override { - if (auto Err = Handler(this->createAbandonedResponseError())) { - // Handlers should not fail when passed an abandoned response error. - report_fatal_error(std::move(Err)); - } - } - -private: - HandlerT Handler; -}; - -// Create a ResponseHandler from a given user handler. -template -std::unique_ptr> createResponseHandler(HandlerT H) { - return std::make_unique>( - std::move(H)); -} - -// Helper for wrapping member functions up as functors. This is useful for -// installing methods as result handlers. -template -class MemberFnWrapper { -public: - using MethodT = RetT (ClassT::*)(ArgTs...); - MemberFnWrapper(ClassT &Instance, MethodT Method) - : Instance(Instance), Method(Method) {} - RetT operator()(ArgTs &&... Args) { - return (Instance.*Method)(std::move(Args)...); - } - -private: - ClassT &Instance; - MethodT Method; -}; - -// Helper that provides a Functor for deserializing arguments. -template class ReadArgs { -public: - Error operator()() { return Error::success(); } -}; - -template -class ReadArgs : public ReadArgs { -public: - ReadArgs(ArgT &Arg, ArgTs &... Args) - : ReadArgs(Args...), Arg(Arg) {} - - Error operator()(ArgT &ArgVal, ArgTs &... ArgVals) { - this->Arg = std::move(ArgVal); - return ReadArgs::operator()(ArgVals...); - } - -private: - ArgT &Arg; -}; - -// Manage sequence numbers. -template class SequenceNumberManager { -public: - // Reset, making all sequence numbers available. 
- void reset() { - std::lock_guard Lock(SeqNoLock); - NextSequenceNumber = 0; - FreeSequenceNumbers.clear(); - } - - // Get the next available sequence number. Will re-use numbers that have - // been released. - SequenceNumberT getSequenceNumber() { - std::lock_guard Lock(SeqNoLock); - if (FreeSequenceNumbers.empty()) - return NextSequenceNumber++; - auto SequenceNumber = FreeSequenceNumbers.back(); - FreeSequenceNumbers.pop_back(); - return SequenceNumber; - } - - // Release a sequence number, making it available for re-use. - void releaseSequenceNumber(SequenceNumberT SequenceNumber) { - std::lock_guard Lock(SeqNoLock); - FreeSequenceNumbers.push_back(SequenceNumber); - } - -private: - std::mutex SeqNoLock; - SequenceNumberT NextSequenceNumber = 0; - std::vector FreeSequenceNumbers; -}; - -// Checks that predicate P holds for each corresponding pair of type arguments -// from T1 and T2 tuple. -template